diff options
Diffstat (limited to 'Documentation')
38 files changed, 1248 insertions, 560 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb index 7772928..deb6b48 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb +++ b/Documentation/ABI/testing/sysfs-bus-usb @@ -144,3 +144,16 @@ Description: Write a 1 to force the device to disconnect (equivalent to unplugging a wired USB device). + +What: /sys/bus/usb/drivers/.../remove_id +Date: November 2009 +Contact: CHENG Renquan <rqcheng@smu.edu.sg> +Description: + Writing a device ID to this file will remove an ID + that was dynamically added via the new_id sysfs entry. + The format for the device ID is: + idVendor idProduct. After successfully + removing an ID, the driver will no longer support the + device. This is useful to ensure auto probing won't + match the driver to the device. For example: + # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc index 4e8106f..25b1e75 100644 --- a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc +++ b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc @@ -23,3 +23,16 @@ Description: Since this relates to security (specifically, the lifetime of PTKs and GTKs) it should not be changed from the default. + +What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_phy_rate +Date: August 2009 +KernelVersion: 2.6.32 +Contact: David Vrabel <david.vrabel@csr.com> +Description: + The maximum PHY rate to use for all connected devices. + This is only of limited use for testing and + development as the hardware's automatic rate + adaptation is better then this simple control. + + Refer to [ECMA-368] section 10.3.1.1 for the value to + use. diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 9fe91c0..bf1627b 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -60,6 +60,19 @@ Description: Users: hotplug memory remove tools https://w3.opensource.ibm.com/projects/powerpc-utils/ + +What: /sys/devices/system/memoryX/nodeY +Date: October 2009 +Contact: Linux Memory Management list <linux-mm@kvack.org> +Description: + When CONFIG_NUMA is enabled, a symbolic link that + points to the corresponding NUMA node directory. + + For example, the following symbolic link is created for + memory section 9 on node0: + /sys/devices/system/memory/memory9/node0 -> ../../node/node0 + + What: /sys/devices/system/node/nodeX/memoryY Date: September 2008 Contact: Gary Hade <garyhade@us.ibm.com> @@ -70,4 +83,3 @@ Description: memory section directory. For example, the following symbolic link is created for memory section 9 on node0. /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 - diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index a703b9e..84a710f 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -62,6 +62,35 @@ Description: CPU topology files that describe kernel limits related to See Documentation/cputopology.txt for more information. +What: /sys/devices/system/cpu/probe + /sys/devices/system/cpu/release +Date: November 2009 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: Dynamic addition and removal of CPU's. This is not hotplug + removal, this is meant complete removal/addition of the CPU + from the system. + + probe: writes to this file will dynamically add a CPU to the + system. Information written to the file to add CPU's is + architecture specific. + + release: writes to this file dynamically remove a CPU from + the system. Information writtento the file to remove CPU's + is architecture specific. + +What: /sys/devices/system/cpu/cpu#/node +Date: October 2009 +Contact: Linux memory management mailing list <linux-mm@kvack.org> +Description: Discover NUMA node a CPU belongs to + + When CONFIG_NUMA is enabled, a symbolic link that points + to the corresponding NUMA node directory. + + For example, the following symlink is created for cpu42 + in NUMA node 2: + + /sys/devices/system/cpu/cpu42/node2 -> ../../node/node2 + What: /sys/devices/system/cpu/cpu#/node Date: October 2009 @@ -136,6 +165,24 @@ Description: Discover cpuidle policy and mechanism See files in Documentation/cpuidle/ for more information. +What: /sys/devices/system/cpu/cpu#/cpufreq/* +Date: pre-git history +Contact: cpufreq@vger.kernel.org +Description: Discover and change clock speed of CPUs + + Clock scaling allows you to change the clock speed of the + CPUs on the fly. This is a nice method to save battery + power, because the lower the clock speed, the less power + the CPU consumes. + + There are many knobs to tweak in this directory. + + See files in Documentation/cpu-freq/ for more information. + + In particular, read Documentation/cpu-freq/user-guide.txt + to learn how to control the knobs. + + What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X Date: August 2008 KernelVersion: 2.6.27 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 6dcf75e..8b093f8 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -45,8 +45,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_fastpath file is read-only and specifies how many - objects have been allocated using the fast path. + The alloc_fastpath file shows how many objects have been + allocated using the fast path. It can be written to clear the + current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_from_partial @@ -55,9 +56,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_from_partial file is read-only and specifies how - many times a cpu slab has been full and it has been refilled - by using a slab from the list of partially used slabs. + The alloc_from_partial file shows how many times a cpu slab has + been full and it has been refilled by using a slab from the list + of partially used slabs. It can be written to clear the current + count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_refill @@ -66,9 +68,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_refill file is read-only and specifies how many - times the per-cpu freelist was empty but there were objects - available as the result of remote cpu frees. + The alloc_refill file shows how many times the per-cpu freelist + was empty but there were objects available as the result of + remote cpu frees. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_slab @@ -77,8 +79,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_slab file is read-only and specifies how many times - a new slab had to be allocated from the page allocator. + The alloc_slab file is shows how many times a new slab had to + be allocated from the page allocator. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_slowpath @@ -87,9 +90,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_slowpath file is read-only and specifies how many - objects have been allocated using the slow path because of a - refill or allocation from a partial or new slab. + The alloc_slowpath file shows how many objects have been + allocated using the slow path because of a refill or + allocation from a partial or new slab. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/cache_dma @@ -117,10 +121,11 @@ KernelVersion: 2.6.31 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file cpuslab_flush is read-only and specifies how many - times a cache's cpu slabs have been flushed as the result of - destroying or shrinking a cache, a cpu going offline, or as - the result of forcing an allocation from a certain node. + The file cpuslab_flush shows how many times a cache's cpu slabs + have been flushed as the result of destroying or shrinking a + cache, a cpu going offline, or as the result of forcing an + allocation from a certain node. It can be written to clear the + current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/ctor @@ -139,8 +144,8 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_empty is read-only and specifies how many - times an empty cpu slab was deactivated. + The deactivate_empty file shows how many times an empty cpu slab + was deactivated. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_full @@ -149,8 +154,8 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_full is read-only and specifies how many - times a full cpu slab was deactivated. + The deactivate_full file shows how many times a full cpu slab + was deactivated. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_remote_frees @@ -159,9 +164,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_remote_frees is read-only and specifies how - many times a cpu slab has been deactivated and contained free - objects that were freed remotely. + The deactivate_remote_frees file shows how many times a cpu slab + has been deactivated and contained free objects that were freed + remotely. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_to_head @@ -170,9 +175,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_to_head is read-only and specifies how - many times a partial cpu slab was deactivated and added to the - head of its node's partial list. + The deactivate_to_head file shows how many times a partial cpu + slab was deactivated and added to the head of its node's partial + list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_to_tail @@ -181,9 +186,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_to_tail is read-only and specifies how - many times a partial cpu slab was deactivated and added to the - tail of its node's partial list. + The deactivate_to_tail file shows how many times a partial cpu + slab was deactivated and added to the tail of its node's partial + list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/destroy_by_rcu @@ -201,9 +206,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file free_add_partial is read-only and specifies how many - times an object has been freed in a full slab so that it had to - added to its node's partial list. + The free_add_partial file shows how many times an object has + been freed in a full slab so that it had to added to its node's + partial list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_calls @@ -222,9 +227,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_fastpath file is read-only and specifies how many - objects have been freed using the fast path because it was an - object from the cpu slab. + The free_fastpath file shows how many objects have been freed + using the fast path because it was an object from the cpu slab. + It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_frozen @@ -233,9 +238,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_frozen file is read-only and specifies how many - objects have been freed to a frozen slab (i.e. a remote cpu - slab). + The free_frozen file shows how many objects have been freed to + a frozen slab (i.e. a remote cpu slab). It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_remove_partial @@ -244,9 +249,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file free_remove_partial is read-only and specifies how - many times an object has been freed to a now-empty slab so - that it had to be removed from its node's partial list. + The free_remove_partial file shows how many times an object has + been freed to a now-empty slab so that it had to be removed from + its node's partial list. It can be written to clear the current + count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_slab @@ -255,8 +261,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_slab file is read-only and specifies how many times an - empty slab has been freed back to the page allocator. + The free_slab file shows how many times an empty slab has been + freed back to the page allocator. It can be written to clear + the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_slowpath @@ -265,9 +272,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_slowpath file is read-only and specifies how many - objects have been freed using the slow path (i.e. to a full or - partial slab). + The free_slowpath file shows how many objects have been freed + using the slow path (i.e. to a full or partial slab). It can + be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/hwcache_align @@ -346,10 +353,10 @@ KernelVersion: 2.6.26 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file order_fallback is read-only and specifies how many - times an allocation of a new slab has not been possible at the - cache's order and instead fallen back to its minimum possible - order. + The order_fallback file shows how many times an allocation of a + new slab has not been possible at the cache's order and instead + fallen back to its minimum possible order. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/partial diff --git a/Documentation/arm/OMAP/DSS b/Documentation/arm/OMAP/DSS new file mode 100644 index 0000000..0af0e9e --- /dev/null +++ b/Documentation/arm/OMAP/DSS @@ -0,0 +1,317 @@ +OMAP2/3 Display Subsystem +------------------------- + +This is an almost total rewrite of the OMAP FB driver in drivers/video/omap +(let's call it DSS1). The main differences between DSS1 and DSS2 are DSI, +TV-out and multiple display support, but there are lots of small improvements +also. + +The DSS2 driver (omapdss module) is in arch/arm/plat-omap/dss/, and the FB, +panel and controller drivers are in drivers/video/omap2/. DSS1 and DSS2 live +currently side by side, you can choose which one to use. + +Features +-------- + +Working and tested features include: + +- MIPI DPI (parallel) output +- MIPI DSI output in command mode +- MIPI DBI (RFBI) output +- SDI output +- TV output +- All pieces can be compiled as a module or inside kernel +- Use DISPC to update any of the outputs +- Use CPU to update RFBI or DSI output +- OMAP DISPC planes +- RGB16, RGB24 packed, RGB24 unpacked +- YUV2, UYVY +- Scaling +- Adjusting DSS FCK to find a good pixel clock +- Use DSI DPLL to create DSS FCK + +Tested boards include: +- OMAP3 SDP board +- Beagle board +- N810 + +omapdss driver +-------------- + +The DSS driver does not itself have any support for Linux framebuffer, V4L or +such like the current ones, but it has an internal kernel API that upper level +drivers can use. + +The DSS driver models OMAP's overlays, overlay managers and displays in a +flexible way to enable non-common multi-display configuration. In addition to +modelling the hardware overlays, omapdss supports virtual overlays and overlay +managers. These can be used when updating a display with CPU or system DMA. + +Panel and controller drivers +---------------------------- + +The drivers implement panel or controller specific functionality and are not +usually visible to users except through omapfb driver. They register +themselves to the DSS driver. + +omapfb driver +------------- + +The omapfb driver implements arbitrary number of standard linux framebuffers. +These framebuffers can be routed flexibly to any overlays, thus allowing very +dynamic display architecture. + +The driver exports some omapfb specific ioctls, which are compatible with the +ioctls in the old driver. + +The rest of the non standard features are exported via sysfs. Whether the final +implementation will use sysfs, or ioctls, is still open. + +V4L2 drivers +------------ + +V4L2 is being implemented in TI. + +From omapdss point of view the V4L2 drivers should be similar to framebuffer +driver. + +Architecture +-------------------- + +Some clarification what the different components do: + + - Framebuffer is a memory area inside OMAP's SRAM/SDRAM that contains the + pixel data for the image. Framebuffer has width and height and color + depth. + - Overlay defines where the pixels are read from and where they go on the + screen. The overlay may be smaller than framebuffer, thus displaying only + part of the framebuffer. The position of the overlay may be changed if + the overlay is smaller than the display. + - Overlay manager combines the overlays in to one image and feeds them to + display. + - Display is the actual physical display device. + +A framebuffer can be connected to multiple overlays to show the same pixel data +on all of the overlays. Note that in this case the overlay input sizes must be +the same, but, in case of video overlays, the output size can be different. Any +framebuffer can be connected to any overlay. + +An overlay can be connected to one overlay manager. Also DISPC overlays can be +connected only to DISPC overlay managers, and virtual overlays can be only +connected to virtual overlays. + +An overlay manager can be connected to one display. There are certain +restrictions which kinds of displays an overlay manager can be connected: + + - DISPC TV overlay manager can be only connected to TV display. + - Virtual overlay managers can only be connected to DBI or DSI displays. + - DISPC LCD overlay manager can be connected to all displays, except TV + display. + +Sysfs +----- +The sysfs interface is mainly used for testing. I don't think sysfs +interface is the best for this in the final version, but I don't quite know +what would be the best interfaces for these things. + +The sysfs interface is divided to two parts: DSS and FB. + +/sys/class/graphics/fb? directory: +mirror 0=off, 1=on +rotate Rotation 0-3 for 0, 90, 180, 270 degrees +rotate_type 0 = DMA rotation, 1 = VRFB rotation +overlays List of overlay numbers to which framebuffer pixels go +phys_addr Physical address of the framebuffer +virt_addr Virtual address of the framebuffer +size Size of the framebuffer + +/sys/devices/platform/omapdss/overlay? directory: +enabled 0=off, 1=on +input_size width,height (ie. the framebuffer size) +manager Destination overlay manager name +name +output_size width,height +position x,y +screen_width width +global_alpha global alpha 0-255 0=transparent 255=opaque + +/sys/devices/platform/omapdss/manager? directory: +display Destination display +name +alpha_blending_enabled 0=off, 1=on +trans_key_enabled 0=off, 1=on +trans_key_type gfx-destination, video-source +trans_key_value transparency color key (RGB24) +default_color default background color (RGB24) + +/sys/devices/platform/omapdss/display? directory: +ctrl_name Controller name +mirror 0=off, 1=on +update_mode 0=off, 1=auto, 2=manual +enabled 0=off, 1=on +name +rotate Rotation 0-3 for 0, 90, 180, 270 degrees +timings Display timings (pixclock,xres/hfp/hbp/hsw,yres/vfp/vbp/vsw) + When writing, two special timings are accepted for tv-out: + "pal" and "ntsc" +panel_name +tear_elim Tearing elimination 0=off, 1=on + +There are also some debugfs files at <debugfs>/omapdss/ which show information +about clocks and registers. + +Examples +-------- + +The following definitions have been made for the examples below: + +ovl0=/sys/devices/platform/omapdss/overlay0 +ovl1=/sys/devices/platform/omapdss/overlay1 +ovl2=/sys/devices/platform/omapdss/overlay2 + +mgr0=/sys/devices/platform/omapdss/manager0 +mgr1=/sys/devices/platform/omapdss/manager1 + +lcd=/sys/devices/platform/omapdss/display0 +dvi=/sys/devices/platform/omapdss/display1 +tv=/sys/devices/platform/omapdss/display2 + +fb0=/sys/class/graphics/fb0 +fb1=/sys/class/graphics/fb1 +fb2=/sys/class/graphics/fb2 + +Default setup on OMAP3 SDP +-------------------------- + +Here's the default setup on OMAP3 SDP board. All planes go to LCD. DVI +and TV-out are not in use. The columns from left to right are: +framebuffers, overlays, overlay managers, displays. Framebuffers are +handled by omapfb, and the rest by the DSS. + +FB0 --- GFX -\ DVI +FB1 --- VID1 --+- LCD ---- LCD +FB2 --- VID2 -/ TV ----- TV + +Example: Switch from LCD to DVI +---------------------- + +w=`cat $dvi/timings | cut -d "," -f 2 | cut -d "/" -f 1` +h=`cat $dvi/timings | cut -d "," -f 3 | cut -d "/" -f 1` + +echo "0" > $lcd/enabled +echo "" > $mgr0/display +fbset -fb /dev/fb0 -xres $w -yres $h -vxres $w -vyres $h +# at this point you have to switch the dvi/lcd dip-switch from the omap board +echo "dvi" > $mgr0/display +echo "1" > $dvi/enabled + +After this the configuration looks like: + +FB0 --- GFX -\ -- DVI +FB1 --- VID1 --+- LCD -/ LCD +FB2 --- VID2 -/ TV ----- TV + +Example: Clone GFX overlay to LCD and TV +------------------------------- + +w=`cat $tv/timings | cut -d "," -f 2 | cut -d "/" -f 1` +h=`cat $tv/timings | cut -d "," -f 3 | cut -d "/" -f 1` + +echo "0" > $ovl0/enabled +echo "0" > $ovl1/enabled + +echo "" > $fb1/overlays +echo "0,1" > $fb0/overlays + +echo "$w,$h" > $ovl1/output_size +echo "tv" > $ovl1/manager + +echo "1" > $ovl0/enabled +echo "1" > $ovl1/enabled + +echo "1" > $tv/enabled + +After this the configuration looks like (only relevant parts shown): + +FB0 +-- GFX ---- LCD ---- LCD + \- VID1 ---- TV ---- TV + +Misc notes +---------- + +OMAP FB allocates the framebuffer memory using the OMAP VRAM allocator. + +Using DSI DPLL to generate pixel clock it is possible produce the pixel clock +of 86.5MHz (max possible), and with that you get 1280x1024@57 output from DVI. + +Rotation and mirroring currently only supports RGB565 and RGB8888 modes. VRFB +does not support mirroring. + +VRFB rotation requires much more memory than non-rotated framebuffer, so you +probably need to increase your vram setting before using VRFB rotation. Also, +many applications may not work with VRFB if they do not pay attention to all +framebuffer parameters. + +Kernel boot arguments +--------------------- + +vram=<size> + - Amount of total VRAM to preallocate. For example, "10M". omapfb + allocates memory for framebuffers from VRAM. + +omapfb.mode=<display>:<mode>[,...] + - Default video mode for specified displays. For example, + "dvi:800x400MR-24@60". See drivers/video/modedb.c. + There are also two special modes: "pal" and "ntsc" that + can be used to tv out. + +omapfb.vram=<fbnum>:<size>[@<physaddr>][,...] + - VRAM allocated for a framebuffer. Normally omapfb allocates vram + depending on the display size. With this you can manually allocate + more or define the physical address of each framebuffer. For example, + "1:4M" to allocate 4M for fb1. + +omapfb.debug=<y|n> + - Enable debug printing. You have to have OMAPFB debug support enabled + in kernel config. + +omapfb.test=<y|n> + - Draw test pattern to framebuffer whenever framebuffer settings change. + You need to have OMAPFB debug support enabled in kernel config. + +omapfb.vrfb=<y|n> + - Use VRFB rotation for all framebuffers. + +omapfb.rotate=<angle> + - Default rotation applied to all framebuffers. + 0 - 0 degree rotation + 1 - 90 degree rotation + 2 - 180 degree rotation + 3 - 270 degree rotation + +omapfb.mirror=<y|n> + - Default mirror for all framebuffers. Only works with DMA rotation. + +omapdss.def_disp=<display> + - Name of default display, to which all overlays will be connected. + Common examples are "lcd" or "tv". + +omapdss.debug=<y|n> + - Enable debug printing. You have to have DSS debug support enabled in + kernel config. + +TODO +---- + +DSS locking + +Error checking +- Lots of checks are missing or implemented just as BUG() + +System DMA update for DSI +- Can be used for RGB16 and RGB24P modes. Probably not for RGB24U (how + to skip the empty byte?) + +OMAP1 support +- Not sure if needed + diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt index 75a58d1..6c30e93 100644 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ b/Documentation/cpu-freq/cpu-drivers.txt @@ -92,9 +92,9 @@ policy->cpuinfo.max_freq - the minimum and maximum frequency (in kHz) which is supported by this CPU policy->cpuinfo.transition_latency the time it takes on this CPU to - switch between two frequencies (if - appropriate, else specify - CPUFREQ_ETERNAL) + switch between two frequencies in + nanoseconds (if appropriate, else + specify CPUFREQ_ETERNAL) policy->cur The current operating frequency of this CPU (if appropriate) diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt index 2a5b850..04f6b32 100644 --- a/Documentation/cpu-freq/user-guide.txt +++ b/Documentation/cpu-freq/user-guide.txt @@ -203,6 +203,17 @@ scaling_cur_freq : Current frequency of the CPU as determined by the frequency the kernel thinks the CPU runs at. +bios_limit : If the BIOS tells the OS to limit a CPU to + lower frequencies, the user can read out the + maximum available frequency from this file. + This typically can happen through (often not + intended) BIOS settings, restrictions + triggered through a service processor or other + BIOS/HW based implementations. + This does not cover thermal ACPI limitations + which can be detected through the generic + thermal driver. + If you have selected the "userspace" governor which allows you to set the CPU operating frequency to a specific value, you can read out the current frequency in diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index 9d620c1..4d4a644b 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -49,6 +49,12 @@ maxcpus=n Restrict boot time cpus to n. Say if you have 4 cpus, using additional_cpus=n (*) Use this to limit hotpluggable cpus. This option sets cpu_possible_map = cpu_present_map + additional_cpus +cede_offline={"off","on"} Use this option to disable/enable putting offlined + processors to an extended H_CEDE state on + supported pseries platforms. + If nothing is specified, + cede_offline is set to "on". + (*) Option valid only for following architectures - ia64 diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 2a4d779..21ab935 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -291,22 +291,6 @@ Who: Michael Buesch <mb@bu3sch.de> --------------------------- -What: usedac i386 kernel parameter -When: 2.6.27 -Why: replaced by allowdac and no dac combination -Who: Glauber Costa <gcosta@redhat.com> - ---------------------------- - -What: print_fn_descriptor_symbol() -When: October 2009 -Why: The %pF vsprintf format provides the same functionality in a - simpler way. print_fn_descriptor_symbol() is deprecated but - still present to give out-of-tree modules time to change. -Who: Bjorn Helgaas <bjorn.helgaas@hp.com> - ---------------------------- - What: /sys/o2cb symlink When: January 2010 Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621e..7001782 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -36,6 +36,8 @@ dnotify.txt - info about directory notification in Linux. ecryptfs.txt - docs on eCryptfs: stacked cryptographic filesystem for Linux. +exofs.txt + - info, usage, mount options, design about EXOFS. ext2.txt - info, mount options and specifications for the Ext2 filesystem. ext3.txt diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt index 0ced74c..abd2a9b 100644 --- a/Documentation/filesystems/exofs.txt +++ b/Documentation/filesystems/exofs.txt @@ -60,13 +60,13 @@ USAGE mkfs.exofs --pid=65536 --format /dev/osd0 - The --format is optional if not specified no OSD_FORMAT will be - preformed and a clean file system will be created in the specified pid, + The --format is optional. If not specified, no OSD_FORMAT will be + performed and a clean file system will be created in the specified pid, in the available space of the target. (Use --format=size_in_meg to limit the total LUN space available) - If pid already exist it will be deleted and a new one will be created in it's - place. Be careful. + If pid already exists, it will be deleted and a new one will be created in + its place. Be careful. An exofs lives inside a single OSD partition. You can create multiple exofs filesystems on the same device using multiple pids. @@ -81,7 +81,7 @@ USAGE 7. For reference (See do-exofs example script): do-exofs start - an example of how to perform the above steps. - do-exofs stop - an example of how to unmount the file system. + do-exofs stop - an example of how to unmount the file system. do-exofs format - an example of how to format and mkfs a new exofs. 8. Extra compilation flags (uncomment in fs/exofs/Kbuild): @@ -104,8 +104,8 @@ Where: exofs specific options: Options are separated by commas (,) pid=<integer> - The partition number to mount/create as container of the filesystem. - This option is mandatory - to=<integer> - Timeout in ticks for a single command + This option is mandatory. + to=<integer> - Timeout in ticks for a single command. default is (60 * HZ) [for debugging only] =============================================================================== @@ -116,7 +116,7 @@ DESIGN with a special ID (defined in common.h). Information included in the file system control block is used to fill the in-memory superblock structure at mount time. This object is created before - the file system is used by mkexofs.c It contains information such as: + the file system is used by mkexofs.c. It contains information such as: - The file system's magic number - The next inode number to be allocated @@ -134,8 +134,8 @@ DESIGN attributes. This applies to both regular files and other types (directories, device files, symlinks, etc.). -* Credentials are generated per object (inode and superblock) when they is - created in memory (read off disk or created). The credential works for all +* Credentials are generated per object (inode and superblock) when they are + created in memory (read from disk or created). The credential works for all operations and is used as long as the object remains in memory. * Async OSD operations are used whenever possible, but the target may execute @@ -145,7 +145,8 @@ DESIGN from executing in reverse order: - The following are handled with the OBJ_CREATED and OBJ_2BCREATED flags. OBJ_CREATED is set when we know the object exists on the OSD - - in create's callback function, and when we successfully do a read_inode. + in create's callback function, and when we successfully do a + read_inode. OBJ_2BCREATED is set in the beginning of the create function, so we know that we should wait. - create/delete: delete should wait until the object is created diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 05d5cf1..867c5b50 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -32,8 +32,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that this forces - mount of inconsistent filesystem, which can lead to +norecovery Don't load the journal on mounting. Note that this forces +noload mount of inconsistent filesystem, which can lead to various problems. data=journal All data are committed into the journal prior to being diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6d94e06..af6885c 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that - if the filesystem was not unmounted cleanly, +norecovery Don't load the journal on mounting. Note that +noload if the filesystem was not unmounted cleanly, skipping the journal replay will lead to the filesystem containing inconsistencies that can lead to any number of problems. @@ -353,6 +353,12 @@ noauto_da_alloc replacing existing files via patterns such as system crashes before the delayed allocation blocks are forced to disk. +discard Controls whether ext4 should issue discard/TRIM +nodiscard(*) commands to the underlying block device when + blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs, but it is off + by default until sufficient testing has been done. + Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 01539f4..4949fca 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -49,8 +49,7 @@ Mount options NILFS2 supports the following mount options: (*) == default -barrier=on(*) This enables/disables barriers. barrier=off disables - it, barrier=on enables it. +nobarrier Disables barriers. errors=continue(*) Keep going on a filesystem error. errors=remount-ro Remount the filesystem read-only on an error. errors=panic Panic and halt the machine if an error occurs. @@ -71,6 +70,10 @@ order=strict Apply strict in-order semantics that preserves sequence blocks. That means, it is guaranteed that no overtaking of events occurs in the recovered file system after a crash. +norecovery Disable recovery of the filesystem on mount. + This disables every write access on the device for + read-only mounts or snapshots. This option will fail + for r/w mounts on an unclean volume. NILFS2 usage ============ diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 94b9f20..220cc63 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -38,6 +38,7 @@ Table of Contents 3.3 /proc/<pid>/io - Display the IO accounting fields 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings 3.5 /proc/<pid>/mountinfo - Information about mounts + 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm ------------------------------------------------------------------------------ @@ -1409,3 +1410,11 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt + +3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm +-------------------------------------------------------- +These files provide a method to access a tasks comm value. It also allows for +a task to set its own or one of its thread siblings comm value. The comm value +is limited in size compared to the cmdline value, so writing anything longer +then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated +comm value. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 623f094..3de2f32 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -472,7 +472,7 @@ __sync_single_inode) to check if ->writepages has been successful in writing out the whole address_space. The Writeback tag is used by filemap*wait* and sync_page* functions, -via wait_on_page_writeback_range, to wait for all writeback to +via filemap_fdatawait_range, to wait for all writeback to complete. While waiting ->sync_page (if defined) will be called on each page that is found to require writeback. diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d index effe949..06534f2 100644 --- a/Documentation/hwmon/lis3lv02d +++ b/Documentation/hwmon/lis3lv02d @@ -3,7 +3,8 @@ Kernel driver lis3lv02d Supported chips: - * STMicroelectronics LIS3LV02DL and LIS3LV02DQ + * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision) + * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) Authors: Yan Burman <burman.yan@gmail.com> @@ -13,32 +14,52 @@ Authors: Description ----------- -This driver provides support for the accelerometer found in various HP -laptops sporting the feature officially called "HP Mobile Data -Protection System 3D" or "HP 3D DriveGuard". It detects automatically -laptops with this sensor. Known models (for now the HP 2133, nc6420, -nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis -automatically oriented on standard way (eg: you can directly play -neverball). The accelerometer data is readable via -/sys/devices/platform/lis3lv02d. +This driver provides support for the accelerometer found in various HP laptops +sporting the feature officially called "HP Mobile Data Protection System 3D" or +"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known +models (full list can be found in drivers/hwmon/hp_accel.c) will have their +axis automatically oriented on standard way (eg: you can directly play +neverball). The accelerometer data is readable via +/sys/devices/platform/lis3lv02d. Reported values are scaled +to mg values (1/1000th of earth gravity). Sysfs attributes under /sys/devices/platform/lis3lv02d/: position - 3D position that the accelerometer reports. Format: "(x,y,z)" -calibrate - read: values (x, y, z) that are used as the base for input - class device operation. - write: forces the base to be recalibrated with the current - position. -rate - reports the sampling rate of the accelerometer device in HZ +rate - read reports the sampling rate of the accelerometer device in HZ. + write changes sampling rate of the accelerometer device. + Only values which are supported by HW are accepted. +selftest - performs selftest for the chip as specified by chip manufacturer. This driver also provides an absolute input class device, allowing -the laptop to act as a pinball machine-esque joystick. +the laptop to act as a pinball machine-esque joystick. Joystick device can be +calibrated. Joystick device can be in two different modes. +By default output values are scaled between -32768 .. 32767. In joystick raw +mode, joystick and sysfs position entry have the same scale. There can be +small difference due to input system fuzziness feature. +Events are also available as input event device. + +Selftest is meant only for hardware diagnostic purposes. It is not meant to be +used during normal operations. Position data is not corrupted during selftest +but interrupt behaviour is not guaranteed to work reliably. In test mode, the +sensing element is internally moved little bit. Selftest measures difference +between normal mode and test mode. Chip specifications tell the acceptance +limit for each type of the chip. Limits are provided via platform data +to allow adjustment of the limits without a change to the actual driver. +Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are +measured difference between modes. Axes are not remapped in selftest mode. +Measurement values are provided to help HW diagnostic applications to make +final decision. + +On HP laptops, if the led infrastructure is activated, support for a led +indicating disk protection will be provided as /sys/class/leds/hp::hddprotect. Another feature of the driver is misc device called "freefall" that acts similar to /dev/rtc and reacts on free-fall interrupts received from the device. It supports blocking operations, poll/select and fasync operation modes. You must read 1 bytes from the device. The result is number of free-fall interrupts since the last successful -read (or 255 if number of interrupts would not fit). +read (or 255 if number of interrupts would not fit). See the hpfall.c +file for an example on using the device. Axes orientation @@ -55,7 +76,7 @@ the accelerometer are converted into a "standard" organisation of the axes * If the laptop is put upside-down, Z becomes negative If your laptop model is not recognized (cf "dmesg"), you can send an -email to the authors to add it to the database. When reporting a new +email to the maintainer to add it to the database. When reporting a new laptop, please include the output of "dmidecode" plus the value of /sys/devices/platform/lis3lv02d/position in these four cases. diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf index 02b7489..b7e42ec 100644 --- a/Documentation/hwmon/w83627ehf +++ b/Documentation/hwmon/w83627ehf @@ -81,8 +81,14 @@ pwm[1-4] - this file stores PWM duty cycle or DC value (fan speed) in range: 0 (stop) to 255 (full) pwm[1-4]_enable - this file controls mode of fan/temperature control: - * 1 Manual Mode, write to pwm file any value 0-255 (full speed) - * 2 Thermal Cruise + * 1 Manual mode, write to pwm file any value 0-255 (full speed) + * 2 "Thermal Cruise" mode + * 3 "Fan Speed Cruise" mode + * 4 "Smart Fan III" mode + +pwm[1-4]_mode - controls if output is PWM or DC level + * 0 DC output (0 - 12v) + * 1 PWM output Thermal Cruise mode ------------------- diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 7860aaf..0a74603 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -44,7 +44,7 @@ static struct i2c_driver foo_driver = { /* if device autodetection is needed: */ .class = I2C_CLASS_SOMETHING, .detect = foo_detect, - .address_data = &addr_data, + .address_list = normal_i2c, .shutdown = foo_shutdown, /* optional */ .suspend = foo_suspend, /* optional */ diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset index 0fc9831..794941f 100644 --- a/Documentation/isdn/README.gigaset +++ b/Documentation/isdn/README.gigaset @@ -68,22 +68,38 @@ GigaSet 307x Device Driver for troubleshooting or to pass module parameters. The module ser_gigaset provides a serial line discipline N_GIGASET_M101 - which drives the device through the regular serial line driver. It must - be attached to the serial line to which the M101 is connected with the - ldattach(8) command (requires util-linux-ng release 2.14 or later), for - example: - ldattach GIGASET_M101 /dev/ttyS1 + which uses the regular serial port driver to access the device, and must + therefore be attached to the serial device to which the M101 is connected. + The ldattach(8) command (included in util-linux-ng release 2.14 or later) + can be used for that purpose, for example: + ldattach GIGASET_M101 /dev/ttyS1 This will open the device file, attach the line discipline to it, and then sleep in the background, keeping the device open so that the line discipline remains active. To deactivate it, kill the daemon, for example with - killall ldattach + killall ldattach before disconnecting the device. To have this happen automatically at system startup/shutdown on an LSB compatible system, create and activate an appropriate LSB startup script /etc/init.d/gigaset. (The init name 'gigaset' is officially assigned to this project by LANANA.) Alternatively, just add the 'ldattach' command line to /etc/rc.local. + The modules accept the following parameters: + + Module Parameter Meaning + + gigaset debug debug level (see section 3.2.) + + startmode initial operation mode (see section 2.5.): + bas_gigaset ) 1=ISDN4linux/CAPI (default), 0=Unimodem + ser_gigaset ) + usb_gigaset ) cidmode initial Call-ID mode setting (see section + 2.5.): 1=on (default), 0=off + + Depending on your distribution you may want to create a separate module + configuration file /etc/modprobe.d/gigaset for these, or add them to a + custom file like /etc/modprobe.conf.local. + 2.2. Device nodes for user space programs ------------------------------------ The device can be accessed from user space (eg. by the user space tools @@ -93,11 +109,48 @@ GigaSet 307x Device Driver - /dev/ttyGU0 for M105 (USB data boxes) - /dev/ttyGB0 for the base driver (direct USB connection) - You can also select a "default device" which is used by the frontends when + If you connect more than one device of a type, they will get consecutive + device nodes, eg. /dev/ttyGU1 for a second M105. + + You can also set a "default device" for the user space tools to use when no device node is given as parameter, by creating a symlink /dev/ttyG to one of them, eg.: - ln -s /dev/ttyGB0 /dev/ttyG + ln -s /dev/ttyGB0 /dev/ttyG + + The devices accept the following device specific ioctl calls + (defined in gigaset_dev.h): + + ioctl(int fd, GIGASET_REDIR, int *cmd); + If cmd==1, the device is set to be controlled exclusively through the + character device node; access from the ISDN subsystem is blocked. + If cmd==0, the device is set to be used from the ISDN subsystem and does + not communicate through the character device node. + + ioctl(int fd, GIGASET_CONFIG, int *cmd); + (ser_gigaset and usb_gigaset only) + If cmd==1, the device is set to adapter configuration mode where commands + are interpreted by the M10x DECT adapter itself instead of being + forwarded to the base station. In this mode, the device accepts the + commands described in Siemens document "AT-Kommando Alignment M10x Data" + for setting the operation mode, associating with a base station and + querying parameters like field strengh and signal quality. + Note that there is no ioctl command for leaving adapter configuration + mode and returning to regular operation. In order to leave adapter + configuration mode, write the command ATO to the device. + + ioctl(int fd, GIGASET_BRKCHARS, unsigned char brkchars[6]); + (usb_gigaset only) + Set the break characters on an M105's internal serial adapter to the six + bytes stored in brkchars[]. Unused bytes should be set to zero. + + ioctl(int fd, GIGASET_VERSION, unsigned version[4]); + Retrieve version information from the driver. version[0] must be set to + one of: + - GIGVER_DRIVER: retrieve driver version + - GIGVER_COMPAT: retrieve interface compatibility version + - GIGVER_FWBASE: retrieve the firmware version of the base + Upon return, version[] is filled with the requested version information. 2.3. ISDN4linux ---------- @@ -113,15 +166,24 @@ GigaSet 307x Device Driver Connection State: 0, Response: -1 gigaset_process_response: resp_code -1 in ConState 0 ! Timeout occurred - you might need to use unimodem mode. (see section 2.5.) + you probably need to use unimodem mode. (see section 2.5.) 2.4. CAPI ---- If the driver is compiled with CAPI support (kernel configuration option GIGASET_CAPI, experimental) it can also be used with CAPI 2.0 kernel and - user space applications. ISDN4Linux is supported in this configuration + user space applications. For user space access, the module capi.ko must + be loaded. The capiinit command (included in the capi4k-utils package) + does this for you. + + The CAPI variant of the driver supports legacy ISDN4Linux applications via the capidrv compatibility driver. The kernel module capidrv.ko must - be loaded explicitly ("modprobe capidrv") if needed. + be loaded explicitly with the command + modprobe capidrv + if needed, and cannot be unloaded again without unloading the driver + first. (These are limitations of capidrv.) + + The note about unimodem mode in the preceding section applies here, too. 2.5. Unimodem mode ------------- @@ -134,9 +196,14 @@ GigaSet 307x Device Driver You can switch back using gigacontr --mode isdn - You can also load the driver using e.g. - modprobe usb_gigaset startmode=0 - to prevent the driver from starting in "isdn4linux mode". + You can also put the driver directly into Unimodem mode when it's loaded, + by passing the module parameter startmode=0 to the hardware specific + module, e.g. + modprobe usb_gigaset startmode=0 + or by adding a line like + options usb_gigaset startmode=0 + to an appropriate module configuration file, like /etc/modprobe.d/gigaset + or /etc/modprobe.conf.local. In this mode the device works like a modem connected to a serial port (the /dev/ttyGU0, ... mentioned above) which understands the commands @@ -164,9 +231,8 @@ GigaSet 307x Device Driver options ppp_async flag_time=0 - to /etc/modprobe.conf. If your distribution has some local module - configuration file like /etc/modprobe.conf.local, - using that should be preferred. + to an appropriate module configuration file, like /etc/modprobe.d/gigaset + or /etc/modprobe.conf.local. 2.6. Call-ID (CID) mode ------------------ @@ -189,12 +255,13 @@ GigaSet 307x Device Driver settings (CID mode). - If you have several DECT data devices (M10x) which you want to use in turn, select Unimodem mode by passing the parameter "cidmode=0" to - the driver ("modprobe usb_gigaset cidmode=0" or modprobe.conf). + the appropriate driver module (ser_gigaset or usb_gigaset). If you want both of these at once, you are out of luck. - You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode - setting (ttyGxy is ttyGU0 or ttyGB0). + You can also use the tty class parameter "cidmode" of the device to + change its CID mode while the driver is loaded, eg. + echo 0 > /sys/class/tty/ttyGU0/cidmode 2.7. Unregistered Wireless Devices (M101/M105) ----------------------------------------- @@ -208,7 +275,7 @@ GigaSet 307x Device Driver driver. In that situation, a restricted set of functions is available which includes, in particular, those necessary for registering the device to a base or for switching it between Fixed Part and Portable Part - modes. + modes. See the gigacontr(8) manpage for details. 3. Troubleshooting --------------- @@ -222,9 +289,7 @@ GigaSet 307x Device Driver options isdn dialtimeout=15 - to /etc/modprobe.conf. If your distribution has some local module - configuration file like /etc/modprobe.conf.local, - using that should be preferred. + to /etc/modprobe.d/gigaset, /etc/modprobe.conf.local or a similar file. Problem: Your isdn script aborts with a message about isdnlog. @@ -264,7 +329,8 @@ GigaSet 307x Device Driver The initial value can be set using the debug parameter when loading the module "gigaset", e.g. by adding a line options gigaset debug=0 - to /etc/modprobe.conf, ... + to your module configuration file, eg. /etc/modprobe.d/gigaset or + /etc/modprobe.conf.local. Generated debugging information can be found - as output of the command diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 777dc8a..ab95d3a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1787,6 +1787,11 @@ and is between 256 and 4096 characters. It is defined in the file waiting for the ACK, so if this is set too high interrupts *may* be lost! + omap_mux= [OMAP] Override bootloader pin multiplexing. + Format: <mux_mode0.mode_name=value>... + For example, to override I2C bus2: + omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100 + opl3= [HW,OSS] Format: <io> @@ -2663,6 +2668,8 @@ and is between 256 and 4096 characters. It is defined in the file to a common usb-storage quirk flag as follows: a = SANE_SENSE (collect more than 18 bytes of sense data); + b = BAD_SENSE (don't collect more than 18 + bytes of sense data); c = FIX_CAPACITY (decrease the reported device capacity by one sector); h = CAPACITY_HEURISTICS (decrease the diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 9cb9138..65f4c79 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -62,8 +62,20 @@ applicable). It also tracks 4 contention points per class. A contention point is a call site that had to wait on lock acquisition. + - CONFIGURATION + +Lock statistics are enabled via CONFIG_LOCK_STATS. + - USAGE +Enable collection of statistics: + +# echo 1 >/proc/sys/kernel/lock_stat + +Disable collection of statistics: + +# echo 0 >/proc/sys/kernel/lock_stat + Look at the current lock statistics: ( line numbers not part of actual output, done for clarity in the explanation diff --git a/Documentation/md.txt b/Documentation/md.txt index 4edd39e..188f476 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -233,9 +233,9 @@ All md devices contain: resync_start The point at which resync should start. If no resync is needed, - this will be a very large number. At array creation it will - default to 0, though starting the array as 'clean' will - set it much larger. + this will be a very large number (or 'none' since 2.6.30-rc1). At + array creation it will default to 0, though starting the array as + 'clean' will set it much larger. new_dev This file can be written but not read. The value written should @@ -296,6 +296,51 @@ All md devices contain: active-idle like active, but no writes have been seen for a while (safe_mode_delay). + bitmap/location + This indicates where the write-intent bitmap for the array is + stored. + It can be one of "none", "file" or "[+-]N". + "file" may later be extended to "file:/file/name" + "[+-]N" means that many sectors from the start of the metadata. + This is replicated on all devices. For arrays with externally + managed metadata, the offset is from the beginning of the + device. + bitmap/chunksize + The size, in bytes, of the chunk which will be represented by a + single bit. For RAID456, it is a portion of an individual + device. For RAID10, it is a portion of the array. For RAID1, it + is both (they come to the same thing). + bitmap/time_base + The time, in seconds, between looking for bits in the bitmap to + be cleared. In the current implementation, a bit will be cleared + between 2 and 3 times "time_base" after all the covered blocks + are known to be in-sync. + bitmap/backlog + When write-mostly devices are active in a RAID1, write requests + to those devices proceed in the background - the filesystem (or + other user of the device) does not have to wait for them. + 'backlog' sets a limit on the number of concurrent background + writes. If there are more than this, new writes will by + synchronous. + bitmap/metadata + This can be either 'internal' or 'external'. + 'internal' is the default and means the metadata for the bitmap + is stored in the first 256 bytes of the allocated space and is + managed by the md module. + 'external' means that bitmap metadata is managed externally to + the kernel (i.e. by some userspace program) + bitmap/can_clear + This is either 'true' or 'false'. If 'true', then bits in the + bitmap will be cleared when the corresponding blocks are thought + to be in-sync. If 'false', bits will never be cleared. + This is automatically set to 'false' if a write happens on a + degraded array, or if the array becomes degraded during a write. + When metadata is managed externally, it should be set to true + once the array becomes non-degraded, and this fact has been + recorded in the metadata. + + + As component devices are added to an md array, they appear in the 'md' directory as new directories named @@ -334,8 +379,9 @@ Each directory contains: Writing "writemostly" sets the writemostly flag. Writing "-writemostly" clears the writemostly flag. Writing "blocked" sets the "blocked" flag. - Writing "-blocked" clear the "blocked" flag and allows writes + Writing "-blocked" clears the "blocked" flag and allows writes to complete. + Writing "in_sync" sets the in_sync flag. This file responds to select/poll. Any change to 'faulty' or 'blocked' causes an event. @@ -372,6 +418,24 @@ Each directory contains: array. If a value less than the current component_size is written, it will be rejected. + recovery_start + + When the device is not 'in_sync', this records the number of + sectors from the start of the device which are known to be + correct. This is normally zero, but during a recovery + operation is will steadily increase, and if the recovery is + interrupted, restoring this value can cause recovery to + avoid repeating the earlier blocks. With v1.x metadata, this + value is saved and restored automatically. + + This can be set whenever the device is not an active member of + the array, either before the array is activated, or before + the 'slot' is set. + + Setting this to 'none' is equivalent to setting 'in_sync'. + Setting to any other value also clears the 'in_sync' flag. + + An active md device will also contain and entry for each active device in the array. These are named diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index bbc8a6a..57e7e9c 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -160,12 +160,15 @@ Under each section, you can see 4 files. NOTE: These directories/files appear after physical memory hotplug phase. -If CONFIG_NUMA is enabled the -/sys/devices/system/memory/memoryXXX memory section -directories can also be accessed via symbolic links located in -the /sys/devices/system/node/node* directories. For example: +If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed +via symbolic links located in the /sys/devices/system/node/node* directories. + +For example: /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 +A backlink will also be created: +/sys/devices/system/memory/memory9/node0 -> ../../node/node0 + -------------------------------- 4. Physical memory hot-add phase -------------------------------- diff --git a/Documentation/misc-devices/ad525x_dpot.txt b/Documentation/misc-devices/ad525x_dpot.txt new file mode 100644 index 0000000..0c9413b --- /dev/null +++ b/Documentation/misc-devices/ad525x_dpot.txt @@ -0,0 +1,57 @@ +--------------------------------- + AD525x Digital Potentiometers +--------------------------------- + +The ad525x_dpot driver exports a simple sysfs interface. This allows you to +work with the immediate resistance settings as well as update the saved startup +settings. Access to the factory programmed tolerance is also provided, but +interpretation of this settings is required by the end application according to +the specific part in use. + +--------- + Files +--------- + +Each dpot device will have a set of eeprom, rdac, and tolerance files. How +many depends on the actual part you have, as will the range of allowed values. + +The eeprom files are used to program the startup value of the device. + +The rdac files are used to program the immediate value of the device. + +The tolerance files are the read-only factory programmed tolerance settings +and may vary greatly on a part-by-part basis. For exact interpretation of +this field, please consult the datasheet for your part. This is presented +as a hex file for easier parsing. + +----------- + Example +----------- + +Locate the device in your sysfs tree. This is probably easiest by going into +the common i2c directory and locating the device by the i2c slave address. + + # ls /sys/bus/i2c/devices/ + 0-0022 0-0027 0-002f + +So assuming the device in question is on the first i2c bus and has the slave +address of 0x2f, we descend (unrelated sysfs entries have been trimmed). + + # ls /sys/bus/i2c/devices/0-002f/ + eeprom0 rdac0 tolerance0 + +You can use simple reads/writes to access these files: + + # cd /sys/bus/i2c/devices/0-002f/ + + # cat eeprom0 + 0 + # echo 10 > eeprom0 + # cat eeprom0 + 10 + + # cat rdac0 + 5 + # echo 3 > rdac0 + # cat rdac0 + 3 diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index b565e82..8e1ddec 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -119,6 +119,32 @@ FURTHER NOTES ON NO-MMU MMAP granule but will only discard the excess if appropriately configured as this has an effect on fragmentation. + (*) The memory allocated by a request for an anonymous mapping will normally + be cleared by the kernel before being returned in accordance with the + Linux man pages (ver 2.22 or later). + + In the MMU case this can be achieved with reasonable performance as + regions are backed by virtual pages, with the contents only being mapped + to cleared physical pages when a write happens on that specific page + (prior to which, the pages are effectively mapped to the global zero page + from which reads can take place). This spreads out the time it takes to + initialize the contents of a page - depending on the write-usage of the + mapping. + + In the no-MMU case, however, anonymous mappings are backed by physical + pages, and the entire map is cleared at allocation time. This can cause + significant delays during a userspace malloc() as the C library does an + anonymous mapping and the kernel then does a memset for the entire map. + + However, for memory that isn't required to be precleared - such as that + returned by malloc() - mmap() can take a MAP_UNINITIALIZED flag to + indicate to the kernel that it shouldn't bother clearing the memory before + returning it. Note that CONFIG_MMAP_ALLOW_UNINITIALIZED must be enabled + to permit this, otherwise the flag will be ignored. + + uClibc uses this to speed up malloc(), and the ELF-FDPIC binfmt uses this + to allocate the brk and stack region. + (*) A list of all the private copy and anonymous mappings on the system is visible through /proc/maps in no-MMU mode. diff --git a/Documentation/powerpc/dts-bindings/fsl/board.txt b/Documentation/powerpc/dts-bindings/fsl/board.txt index e8b5bc2..39e9415 100644 --- a/Documentation/powerpc/dts-bindings/fsl/board.txt +++ b/Documentation/powerpc/dts-bindings/fsl/board.txt @@ -20,12 +20,16 @@ Required properities: - compatible : should be "fsl,fpga-pixis". - reg : should contain the address and the length of the FPPGA register set. +- interrupt-parent: should specify phandle for the interrupt controller. +- interrupts : should specify event (wakeup) IRQ. Example (MPC8610HPCD): board-control@e8000000 { compatible = "fsl,fpga-pixis"; reg = <0xe8000000 32>; + interrupt-parent = <&mpic>; + interrupts = <8 8>; }; * Freescale BCSR GPIO banks diff --git a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt index cabc780..5c6602d 100644 --- a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt +++ b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt @@ -103,7 +103,22 @@ fsl,mpc5200-gpt nodes --------------------- On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board design supports the internal wdt, then the device node for GPT0 should -include the empty property 'fsl,has-wdt'. +include the empty property 'fsl,has-wdt'. Note that this does not activate +the watchdog. The timer will function as a GPT if the timer api is used, and +it will function as watchdog if the watchdog device is used. The watchdog +mode has priority over the gpt mode, i.e. if the watchdog is activated, any +gpt api call to this timer will fail with -EBUSY. + +If you add the property + fsl,wdt-on-boot = <n>; +GPT0 will be marked as in-use watchdog, i.e. blocking every gpt access to it. +If n>0, the watchdog is started with a timeout of n seconds. If n=0, the +configuration of the watchdog is not touched. This is useful in two cases: +- just mark GPT0 as watchdog, blocking gpt accesses, and configure it later; +- do not touch a configuration assigned by the boot loader which supervises + the boot process itself. + +The watchdog will respect the CONFIG_WATCHDOG_NOWAYOUT option. An mpc5200-gpt can be used as a single line GPIO controller. To do so, add the following properties to the gpt node: diff --git a/Documentation/powerpc/dts-bindings/xilinx.txt b/Documentation/powerpc/dts-bindings/xilinx.txt index 80339fe..ea68046 100644 --- a/Documentation/powerpc/dts-bindings/xilinx.txt +++ b/Documentation/powerpc/dts-bindings/xilinx.txt @@ -292,4 +292,15 @@ - reg-offset : A value of 3 is required - reg-shift : A value of 2 is required + vii) Xilinx USB Host controller + + The Xilinx USB host controller is EHCI compatible but with a different + base address for the EHCI registers, and it is always a big-endian + USB Host controller. The hardware can be configured as high speed only, + or high speed/full speed hybrid. + + Required properties: + - xlnx,support-usb-fs: A value 0 means the core is built as high speed + only. A value 1 means the core also supports + full speed devices. diff --git a/Documentation/serial/hayes-esp.txt b/Documentation/serial/hayes-esp.txt deleted file mode 100644 index 09b5d58..0000000 --- a/Documentation/serial/hayes-esp.txt +++ /dev/null @@ -1,154 +0,0 @@ -HAYES ESP DRIVER VERSION 2.1 - -A big thanks to the people at Hayes, especially Alan Adamson. Their support -has enabled me to provide enhancements to the driver. - -Please report your experiences with this driver to me (arobinso@nyx.net). I -am looking for both positive and negative feedback. - -*** IMPORTANT CHANGES FOR 2.1 *** -Support for PIO mode. Five situations will cause PIO mode to be used: -1) A multiport card is detected. PIO mode will always be used. (8 port cards -do not support DMA). -2) The DMA channel is set to an invalid value (anything other than 1 or 3). -3) The DMA buffer/channel could not be allocated. The port will revert to PIO -mode until it is reopened. -4) Less than a specified number of bytes need to be transferred to/from the -FIFOs. PIO mode will be used for that transfer only. -5) A port needs to do a DMA transfer and another port is already using the -DMA channel. PIO mode will be used for that transfer only. - -Since the Hayes ESP seems to conflict with other cards (notably sound cards) -when using DMA, DMA is turned off by default. To use DMA, it must be turned -on explicitly, either with the "dma=" option described below or with -setserial. A multiport card can be forced into DMA mode by using setserial; -however, most multiport cards don't support DMA. - -The latest version of setserial allows the enhanced configuration of the ESP -card to be viewed and modified. -*** - -This package contains the files needed to compile a module to support the Hayes -ESP card. The drivers are basically a modified version of the serial drivers. - -Features: - -- Uses the enhanced mode of the ESP card, allowing a wider range of - interrupts and features than compatibility mode -- Uses DMA and 16 bit PIO mode to transfer data to and from the ESP's FIFOs, - reducing CPU load -- Supports primary and secondary ports - - -If the driver is compiled as a module, the IRQs to use can be specified by -using the irq= option. The format is: - -irq=[0x100],[0x140],[0x180],[0x200],[0x240],[0x280],[0x300],[0x380] - -The address in brackets is the base address of the card. The IRQ of -nonexistent cards can be set to 0. If an IRQ of a card that does exist is set -to 0, the driver will attempt to guess at the correct IRQ. For example, to set -the IRQ of the card at address 0x300 to 12, the insmod command would be: - -insmod esp irq=0,0,0,0,0,0,12,0 - -The custom divisor can be set by using the divisor= option. The format is the -same as for the irq= option. Each divisor value is a series of hex digits, -with each digit representing the divisor to use for a corresponding port. The -divisor value is constructed RIGHT TO LEFT. Specifying a nonzero divisor value -will automatically set the spd_cust flag. To calculate the divisor to use for -a certain baud rate, divide the port's base baud (generally 921600) by the -desired rate. For example, to set the divisor of the primary port at 0x300 to -4 and the divisor of the secondary port at 0x308 to 8, the insmod command would -be: - -insmod esp divisor=0,0,0,0,0,0,0x84,0 - -The dma= option can be used to set the DMA channel. The channel can be either -1 or 3. Specifying any other value will force the driver to use PIO mode. -For example, to set the DMA channel to 3, the insmod command would be: - -insmod esp dma=3 - -The rx_trigger= and tx_trigger= options can be used to set the FIFO trigger -levels. They specify when the ESP card should send an interrupt. Larger -values will decrease the number of interrupts; however, a value too high may -result in data loss. Valid values are 1 through 1023, with 768 being the -default. For example, to set the receive trigger level to 512 bytes and the -transmit trigger level to 700 bytes, the insmod command would be: - -insmod esp rx_trigger=512 tx_trigger=700 - -The flow_off= and flow_on= options can be used to set the hardware flow off/ -flow on levels. The flow on level must be lower than the flow off level, and -the flow off level should be higher than rx_trigger. Valid values are 1 -through 1023, with 1016 being the default flow off level and 944 being the -default flow on level. For example, to set the flow off level to 1000 bytes -and the flow on level to 935 bytes, the insmod command would be: - -insmod esp flow_off=1000 flow_on=935 - -The rx_timeout= option can be used to set the receive timeout value. This -value indicates how long after receiving the last character that the ESP card -should wait before signalling an interrupt. Valid values are 0 though 255, -with 128 being the default. A value too high will increase latency, and a -value too low will cause unnecessary interrupts. For example, to set the -receive timeout to 255, the insmod command would be: - -insmod esp rx_timeout=255 - -The pio_threshold= option sets the threshold (in number of characters) for -using PIO mode instead of DMA mode. For example, if this value is 32, -transfers of 32 bytes or less will always use PIO mode. - -insmod esp pio_threshold=32 - -Multiple options can be listed on the insmod command line by separating each -option with a space. For example: - -insmod esp dma=3 trigger=512 - -The esp module can be automatically loaded when needed. To cause this to -happen, add the following lines to /etc/modprobe.conf (replacing the last line -with options for your configuration): - -alias char-major-57 esp -alias char-major-58 esp -options esp irq=0,0,0,0,0,0,3,0 divisor=0,0,0,0,0,0,0x4,0 - -You may also need to run 'depmod -a'. - -Devices must be created manually. To create the devices, note the output from -the module after it is inserted. The output will appear in the location where -kernel messages usually appear (usually /var/adm/messages). Create two devices -for each 'tty' mentioned, one with major of 57 and the other with major of 58. -The minor number should be the same as the tty number reported. The commands -would be (replace ? with the tty number): - -mknod /dev/ttyP? c 57 ? -mknod /dev/cup? c 58 ? - -For example, if the following line appears: - -Oct 24 18:17:23 techno kernel: ttyP8 at 0x0140 (irq = 3) is an ESP primary port - -...two devices should be created: - -mknod /dev/ttyP8 c 57 8 -mknod /dev/cup8 c 58 8 - -You may need to set the permissions on the devices: - -chmod 666 /dev/ttyP* -chmod 666 /dev/cup* - -The ESP module and the serial module should not conflict (they can be used at -the same time). After the ESP module has been loaded the ports on the ESP card -will no longer be accessible by the serial driver. - -If I/O errors are experienced when accessing the port, check for IRQ and DMA -conflicts ('cat /proc/interrupts' and 'cat /proc/dma' for a list of IRQs and -DMAs currently in use). - -Enjoy! -Andrew J. Robinson <arobinso@nyx.net> diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt index 8e65c44..5e5349a 100644 --- a/Documentation/serial/tty.txt +++ b/Documentation/serial/tty.txt @@ -42,7 +42,8 @@ TTY side interfaces: open() - Called when the line discipline is attached to the terminal. No other call into the line discipline for this tty will occur until it - completes successfully. Can sleep. + completes successfully. Returning an error will + prevent the ldisc from being attached. Can sleep. close() - This is called on a terminal when the line discipline is being unplugged. At the point of @@ -52,7 +53,7 @@ close() - This is called on a terminal when the line hangup() - Called when the tty line is hung up. The line discipline should cease I/O to the tty. No further calls into the ldisc code will occur. - Can sleep. + The return value is ignored. Can sleep. write() - A process is writing data through the line discipline. Multiple write calls are serialized @@ -83,6 +84,10 @@ ioctl() - Called when an ioctl is handed to the tty layer that might be for the ldisc. Multiple ioctl calls may occur in parallel. May sleep. +compat_ioctl() - Called when a 32 bit ioctl is handed to the tty layer + that might be for the ldisc. Multiple ioctl calls + may occur in parallel. May sleep. + Driver Side Interfaces: receive_buf() - Hand buffers of bytes from the driver to the ldisc diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt index 619699d..178c831 100644 --- a/Documentation/spinlocks.txt +++ b/Documentation/spinlocks.txt @@ -1,73 +1,8 @@ -SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and -are hence deprecated. +Lesson 1: Spin locks -Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or -__SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate for static -initialization. - -Most of the time, you can simply turn: - - static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED; - -into: - - static DEFINE_SPINLOCK(xxx_lock); - -Static structure member variables go from: - - struct foo bar { - .lock = SPIN_LOCK_UNLOCKED; - }; - -to: - - struct foo bar { - .lock = __SPIN_LOCK_UNLOCKED(bar.lock); - }; - -Declaration of static rw_locks undergo a similar transformation. - -Dynamic initialization, when necessary, may be performed as -demonstrated below. - - spinlock_t xxx_lock; - rwlock_t xxx_rw_lock; - - static int __init xxx_init(void) - { - spin_lock_init(&xxx_lock); - rwlock_init(&xxx_rw_lock); - ... - } - - module_init(xxx_init); - -The following discussion is still valid, however, with the dynamic -initialization of spinlocks or with DEFINE_SPINLOCK, etc., used -instead of SPIN_LOCK_UNLOCKED. - ------------------------ - -On Fri, 2 Jan 1998, Doug Ledford wrote: -> -> I'm working on making the aic7xxx driver more SMP friendly (as well as -> importing the latest FreeBSD sequencer code to have 7895 support) and wanted -> to get some info from you. The goal here is to make the various routines -> SMP safe as well as UP safe during interrupts and other manipulating -> routines. So far, I've added a spin_lock variable to things like my queue -> structs. Now, from what I recall, there are some spin lock functions I can -> use to lock these spin locks from other use as opposed to a (nasty) -> save_flags(); cli(); stuff; restore_flags(); construct. Where do I find -> these routines and go about making use of them? Do they only lock on a -> per-processor basis or can they also lock say an interrupt routine from -> mucking with a queue if the queue routine was manipulating it when the -> interrupt occurred, or should I still use a cli(); based construct on that -> one? - -See <asm/spinlock.h>. The basic version is: - - spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED; +The most basic primitive for locking is spinlock. +static DEFINE_SPINLOCK(xxx_lock); unsigned long flags; @@ -75,13 +10,11 @@ See <asm/spinlock.h>. The basic version is: ... critical section here .. spin_unlock_irqrestore(&xxx_lock, flags); -and the above is always safe. It will disable interrupts _locally_, but the +The above is always safe. It will disable interrupts _locally_, but the spinlock itself will guarantee the global lock, so it will guarantee that there is only one thread-of-control within the region(s) protected by that -lock. - -Note that it works well even under UP - the above sequence under UP -essentially is just the same as doing a +lock. This works well even under UP. The above sequence under UP +essentially is just the same as doing unsigned long flags; @@ -91,15 +24,13 @@ essentially is just the same as doing a so the code does _not_ need to worry about UP vs SMP issues: the spinlocks work correctly under both (and spinlocks are actually more efficient on -architectures that allow doing the "save_flags + cli" in one go because I -don't export that interface normally). +architectures that allow doing the "save_flags + cli" in one operation). + + NOTE! Implications of spin_locks for memory are further described in: -NOTE NOTE NOTE! The reason the spinlock is so much faster than a global -interrupt lock under SMP is exactly because it disables interrupts only on -the local CPU. The spin-lock is safe only when you _also_ use the lock -itself to do locking across CPU's, which implies that EVERYTHING that -touches a shared variable has to agree about the spinlock they want to -use. + Documentation/memory-barriers.txt + (5) LOCK operations. + (6) UNLOCK operations. The above is usually pretty simple (you usually need and want only one spinlock for most things - using more than one spinlock can make things a @@ -120,20 +51,24 @@ and another sequence that does then they are NOT mutually exclusive, and the critical regions can happen at the same time on two different CPU's. That's fine per se, but the critical regions had better be critical for different things (ie they -can't stomp on each other). +can't stomp on each other). The above is a problem mainly if you end up mixing code - for example the routines in ll_rw_block() tend to use cli/sti to protect the atomicity of their actions, and if a driver uses spinlocks instead then you should -think about issues like the above.. +think about issues like the above. This is really the only really hard part about spinlocks: once you start using spinlocks they tend to expand to areas you might not have noticed before, because you have to make sure the spinlocks correctly protect the shared data structures _everywhere_ they are used. The spinlocks are most -easily added to places that are completely independent of other code (ie -internal driver data structures that nobody else ever touches, for -example). +easily added to places that are completely independent of other code (for +example, internal driver data structures that nobody else ever touches). + + NOTE! The spin-lock is safe only when you _also_ use the lock itself + to do locking across CPU's, which implies that EVERYTHING that + touches a shared variable has to agree about the spinlock they want + to use. ---- @@ -141,13 +76,17 @@ Lesson 2: reader-writer spinlocks. If your data accesses have a very natural pattern where you usually tend to mostly read from the shared variables, the reader-writer locks -(rw_lock) versions of the spinlocks are often nicer. They allow multiple +(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple readers to be in the same critical region at once, but if somebody wants -to change the variables it has to get an exclusive write lock. The -routines look the same as above: +to change the variables it has to get an exclusive write lock. - rwlock_t xxx_lock = RW_LOCK_UNLOCKED; + NOTE! reader-writer locks require more atomic memory operations than + simple spinlocks. Unless the reader critical section is long, you + are better off just using spinlocks. +The routines look the same as above: + + rwlock_t xxx_lock = RW_LOCK_UNLOCKED; unsigned long flags; @@ -159,18 +98,21 @@ routines look the same as above: .. read and write exclusive access to the info ... write_unlock_irqrestore(&xxx_lock, flags); -The above kind of lock is useful for complex data structures like linked -lists etc, especially when you know that most of the work is to just -traverse the list searching for entries without changing the list itself, -for example. Then you can use the read lock for that kind of list -traversal, which allows many concurrent readers. Anything that _changes_ -the list will have to get the write lock. +The above kind of lock may be useful for complex data structures like +linked lists, especially searching for entries without changing the list +itself. The read lock allows many concurrent readers. Anything that +_changes_ the list will have to get the write lock. + + NOTE! RCU is better for list traversal, but requires careful + attention to design detail (see Documentation/RCU/listRCU.txt). -Note: you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ +Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ time need to do any changes (even if you don't do it every time), you have -to get the write-lock at the very beginning. I could fairly easily add a -primitive to create a "upgradeable" read-lock, but it hasn't been an issue -yet. Tell me if you'd want one. +to get the write-lock at the very beginning. + + NOTE! We are working hard to remove reader-writer spinlocks in most + cases, so please don't add a new one without consensus. (Instead, see + Documentation/RCU/rcu.txt for complete information.) ---- @@ -233,4 +175,46 @@ indeed), while write-locks need to protect themselves against interrupts. Linus +---- + +Reference information: + +For dynamic initialization, use spin_lock_init() or rwlock_init() as +appropriate: + + spinlock_t xxx_lock; + rwlock_t xxx_rw_lock; + + static int __init xxx_init(void) + { + spin_lock_init(&xxx_lock); + rwlock_init(&xxx_rw_lock); + ... + } + + module_init(xxx_init); + +For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or +__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. + +SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. These interfere +with lockdep state tracking. + +Most of the time, you can simply turn: + static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED; +into: + static DEFINE_SPINLOCK(xxx_lock); + +Static structure member variables go from: + + struct foo bar { + .lock = SPIN_LOCK_UNLOCKED; + }; + +to: + struct foo bar { + .lock = __SPIN_LOCK_UNLOCKED(bar.lock); + }; + +Declaration of static rw_locks undergo a similar transformation. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 8f7a0e7..3894eaa 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -19,6 +19,8 @@ Currently, these files might (depending on your configuration) show up in /proc/sys/kernel: - acpi_video_flags - acct +- bootloader_type [ X86 only ] +- bootloader_version [ X86 only ] - callhome [ S390 only ] - auto_msgmni - core_pattern @@ -93,6 +95,35 @@ valid for 30 seconds. ============================================================== +bootloader_type: + +x86 bootloader identification + +This gives the bootloader type number as indicated by the bootloader, +shifted left by 4, and OR'd with the low four bits of the bootloader +version. The reason for this encoding is that this used to match the +type_of_loader field in the kernel header; the encoding is kept for +backwards compatibility. That is, if the full bootloader type number +is 0x15 and the full version number is 0x234, this file will contain +the value 340 = 0x154. + +See the type_of_loader and ext_loader_type fields in +Documentation/x86/boot.txt for additional information. + +============================================================== + +bootloader_version: + +x86 bootloader version + +The complete bootloader version number. In the example above, this +file will contain the value 564 = 0x234. + +See the type_of_loader and ext_loader_ver fields in +Documentation/x86/boot.txt for additional information. + +============================================================== + callhome: Controls the kernel's callhome behavior in case of a kernel panic. diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt index ad64261..c7c1dc2 100644 --- a/Documentation/usb/power-management.txt +++ b/Documentation/usb/power-management.txt @@ -2,7 +2,7 @@ Alan Stern <stern@rowland.harvard.edu> - October 5, 2007 + November 10, 2009 @@ -123,9 +123,9 @@ relevant attribute files are: wakeup, level, and autosuspend. power/level - This file contains one of three words: "on", "auto", - or "suspend". You can write those words to the file - to change the device's setting. + This file contains one of two words: "on" or "auto". + You can write those words to the file to change the + device's setting. "on" means that the device should be resumed and autosuspend is not allowed. (Of course, system @@ -134,10 +134,10 @@ relevant attribute files are: wakeup, level, and autosuspend. "auto" is the normal state in which the kernel is allowed to autosuspend and autoresume the device. - "suspend" means that the device should remain - suspended, and autoresume is not allowed. (But remote - wakeup may still be allowed, since it is controlled - separately by the power/wakeup attribute.) + (In kernels up to 2.6.32, you could also specify + "suspend", meaning that the device should remain + suspended and autoresume was not allowed. This + setting is no longer supported.) power/autosuspend @@ -313,13 +313,14 @@ three of the methods listed above. In addition, a driver indicates that it supports autosuspend by setting the .supports_autosuspend flag in its usb_driver structure. It is then responsible for informing the USB core whenever one of its interfaces becomes busy or idle. The -driver does so by calling these five functions: +driver does so by calling these six functions: int usb_autopm_get_interface(struct usb_interface *intf); void usb_autopm_put_interface(struct usb_interface *intf); - int usb_autopm_set_interface(struct usb_interface *intf); int usb_autopm_get_interface_async(struct usb_interface *intf); void usb_autopm_put_interface_async(struct usb_interface *intf); + void usb_autopm_get_interface_no_resume(struct usb_interface *intf); + void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); The functions work by maintaining a counter in the usb_interface structure. When intf->pm_usage_count is > 0 then the interface is @@ -331,11 +332,13 @@ considered to be idle, and the kernel may autosuspend the device. associated with the device itself rather than any of its interfaces. This field is used only by the USB core.) -The driver owns intf->pm_usage_count; it can modify the value however -and whenever it likes. A nice aspect of the non-async usb_autopm_* -routines is that the changes they make are protected by the usb_device -structure's PM mutex (udev->pm_mutex); however drivers may change -pm_usage_count without holding the mutex. Drivers using the async +Drivers must not modify intf->pm_usage_count directly; its value +should be changed only be using the functions listed above. Drivers +are responsible for insuring that the overall change to pm_usage_count +during their lifetime balances out to 0 (it may be necessary for the +disconnect method to call usb_autopm_put_interface() one or more times +to fulfill this requirement). The first two routines use the PM mutex +in struct usb_device for mutual exclusion; drivers using the async routines are responsible for their own synchronization and mutual exclusion. @@ -347,11 +350,6 @@ exclusion. attempts an autosuspend if the new value is <= 0 and the device isn't suspended. - usb_autopm_set_interface() leaves pm_usage_count alone. - It attempts an autoresume if the value is > 0 and the device - is suspended, and it attempts an autosuspend if the value is - <= 0 and the device isn't suspended. - usb_autopm_get_interface_async() and usb_autopm_put_interface_async() do almost the same things as their non-async counterparts. The differences are: they do @@ -360,13 +358,11 @@ exclusion. such as an URB's completion handler, but when they return the device will not generally not yet be in the desired state. -There also are a couple of utility routines drivers can use: - - usb_autopm_enable() sets pm_usage_cnt to 0 and then calls - usb_autopm_set_interface(), which will attempt an autosuspend. - - usb_autopm_disable() sets pm_usage_cnt to 1 and then calls - usb_autopm_set_interface(), which will attempt an autoresume. + usb_autopm_get_interface_no_resume() and + usb_autopm_put_interface_no_suspend() merely increment or + decrement the pm_usage_count value; they do not attempt to + carry out an autoresume or an autosuspend. Hence they can be + called in an atomic context. The conventional usage pattern is that a driver calls usb_autopm_get_interface() in its open routine and @@ -400,11 +396,11 @@ though, setting this flag won't cause the kernel to autoresume it. Normally a driver would set this flag in its probe method, at which time the device is guaranteed not to be autosuspended.) -The usb_autopm_* routines have to run in a sleepable process context; -they must not be called from an interrupt handler or while holding a -spinlock. In fact, the entire autosuspend mechanism is not well geared -toward interrupt-driven operation. However there is one thing a -driver can do in an interrupt handler: +The synchronous usb_autopm_* routines have to run in a sleepable +process context; they must not be called from an interrupt handler or +while holding a spinlock. In fact, the entire autosuspend mechanism +is not well geared toward interrupt-driven operation. However there +is one thing a driver can do in an interrupt handler: usb_mark_last_busy(struct usb_device *udev); @@ -423,15 +419,16 @@ an URB had completed too recently. External suspend calls should never be allowed to fail in this way, only autosuspend calls. The driver can tell them apart by checking -udev->auto_pm; this flag will be set to 1 for internal PM events -(autosuspend or autoresume) and 0 for external PM events. +the PM_EVENT_AUTO bit in the message.event argument to the suspend +method; this bit will be set for internal PM events (autosuspend) and +clear for external PM events. Many of the ingredients in the autosuspend framework are oriented towards interfaces: The usb_interface structure contains the pm_usage_cnt field, and the usb_autopm_* routines take an interface pointer as their argument. But somewhat confusingly, a few of the -pieces (usb_mark_last_busy() and udev->auto_pm) use the usb_device -structure instead. Drivers need to keep this straight; they can call +pieces (i.e., usb_mark_last_busy()) use the usb_device structure +instead. Drivers need to keep this straight; they can call interface_to_usbdev() to find the device structure for a given interface. diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 82a7bd1..bc31636 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -11,23 +11,21 @@ This optimization is more critical now as bigger and bigger physical memories (several GBs) are more readily available. Users can use the huge page support in Linux kernel by either using the mmap -system call or standard SYSv shared memory system calls (shmget, shmat). +system call or standard SYSV shared memory system calls (shmget, shmat). First the Linux kernel needs to be built with the CONFIG_HUGETLBFS (present under "File systems") and CONFIG_HUGETLB_PAGE (selected automatically when CONFIG_HUGETLBFS is selected) configuration options. -The kernel built with huge page support should show the number of configured -huge pages in the system by running the "cat /proc/meminfo" command. +The /proc/meminfo file provides information about the total number of +persistent hugetlb pages in the kernel's huge page pool. It also displays +information about the number of free, reserved and surplus huge pages and the +default huge page size. The huge page size is needed for generating the +proper alignment and size of the arguments to system calls that map huge page +regions. -/proc/meminfo also provides information about the total number of hugetlb -pages configured in the kernel. It also displays information about the -number of free hugetlb pages at any time. It also displays information about -the configured huge page size - this is needed for generating the proper -alignment and size of the arguments to the above system calls. - -The output of "cat /proc/meminfo" will have lines like: +The output of "cat /proc/meminfo" will include lines like: ..... HugePages_Total: vvv @@ -53,59 +51,63 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. -/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb -pages in the kernel. Super user can dynamically request more (or free some -pre-configured) huge pages. -The allocation (or deallocation) of hugetlb pages is possible only if there are -enough physically contiguous free pages in system (freeing of huge pages is -possible only if there are enough hugetlb pages free that can be transferred -back to regular memory pool). +/proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge +pages in the kernel's huge page pool. "Persistent" huge pages will be +returned to the huge page pool when freed by a task. A user with root +privileges can dynamically allocate more or free some persistent huge pages +by increasing or decreasing the value of 'nr_hugepages'. -Pages that are used as hugetlb pages are reserved inside the kernel and cannot -be used for other purposes. +Pages that are used as huge pages are reserved inside the kernel and cannot +be used for other purposes. Huge pages cannot be swapped out under +memory pressure. -Once the kernel with Hugetlb page support is built and running, a user can -use either the mmap system call or shared memory system calls to start using -the huge pages. It is required that the system administrator preallocate -enough memory for huge page purposes. +Once a number of huge pages have been pre-allocated to the kernel huge page +pool, a user with appropriate privilege can use either the mmap system call +or shared memory system calls to use the huge pages. See the discussion of +Using Huge Pages, below. -The administrator can preallocate huge pages on the kernel boot command line by -specifying the "hugepages=N" parameter, where 'N' = the number of huge pages -requested. This is the most reliable method for preallocating huge pages as -memory has not yet become fragmented. +The administrator can allocate persistent huge pages on the kernel boot +command line by specifying the "hugepages=N" parameter, where 'N' = the +number of huge pages requested. This is the most reliable method of +allocating huge pages as memory has not yet become fragmented. -Some platforms support multiple huge page sizes. To preallocate huge pages +Some platforms support multiple huge page sizes. To allocate huge pages of a specific size, one must preceed the huge pages boot command parameters with a huge page size selection parameter "hugepagesz=<size>". <size> must be specified in bytes with optional scale suffix [kKmMgG]. The default huge page size may be selected with the "default_hugepagesz=<size>" boot parameter. -/proc/sys/vm/nr_hugepages indicates the current number of configured [default -size] hugetlb pages in the kernel. Super user can dynamically request more -(or free some pre-configured) huge pages. - -Use the following command to dynamically allocate/deallocate default sized -huge pages: +When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages +indicates the current number of pre-allocated huge pages of the default size. +Thus, one can use the following command to dynamically allocate/deallocate +default sized persistent huge pages: echo 20 > /proc/sys/vm/nr_hugepages -This command will try to configure 20 default sized huge pages in the system. +This command will try to adjust the number of default sized huge pages in the +huge page pool to 20, allocating or freeing huge pages, as required. + On a NUMA platform, the kernel will attempt to distribute the huge page pool -over the all on-line nodes. These huge pages, allocated when nr_hugepages -is increased, are called "persistent huge pages". +over all the set of allowed nodes specified by the NUMA memory policy of the +task that modifies nr_hugepages. The default for the allowed nodes--when the +task has default memory policy--is all on-line nodes with memory. Allowed +nodes with insufficient available, contiguous memory for a huge page will be +silently skipped when allocating persistent huge pages. See the discussion +below of the interaction of task memory policy, cpusets and per node attributes +with the allocation and freeing of persistent huge pages. The success or failure of huge page allocation depends on the amount of -physically contiguous memory that is preset in system at the time of the +physically contiguous memory that is present in system at the time of the allocation attempt. If the kernel is unable to allocate huge pages from some nodes in a NUMA system, it will attempt to make up the difference by allocating extra pages on other nodes with sufficient available contiguous memory, if any. -System administrators may want to put this command in one of the local rc init -files. This will enable the kernel to request huge pages early in the boot -process when the possibility of getting physical contiguous pages is still -very high. Administrators can verify the number of huge pages actually -allocated by checking the sysctl or meminfo. To check the per node +System administrators may want to put this command in one of the local rc +init files. This will enable the kernel to allocate huge pages early in +the boot process when the possibility of getting physical contiguous pages +is still very high. Administrators can verify the number of huge pages +actually allocated by checking the sysctl or meminfo. To check the per node distribution of huge pages in a NUMA system, use: cat /sys/devices/system/node/node*/meminfo | fgrep Huge @@ -113,45 +115,47 @@ distribution of huge pages in a NUMA system, use: /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are requested by applications. Writing any non-zero value into this file -indicates that the hugetlb subsystem is allowed to try to obtain "surplus" -huge pages from the buddy allocator, when the normal pool is exhausted. As -these surplus huge pages go out of use, they are freed back to the buddy -allocator. +indicates that the hugetlb subsystem is allowed to try to obtain that +number of "surplus" huge pages from the kernel's normal page pool, when the +persistent huge page pool is exhausted. As these surplus huge pages become +unused, they are freed back to the kernel's normal page pool. -When increasing the huge page pool size via nr_hugepages, any surplus +When increasing the huge page pool size via nr_hugepages, any existing surplus pages will first be promoted to persistent huge pages. Then, additional huge pages will be allocated, if necessary and if possible, to fulfill -the new huge page pool size. +the new persistent huge page pool size. -The administrator may shrink the pool of preallocated huge pages for +The administrator may shrink the pool of persistent huge pages for the default huge page size by setting the nr_hugepages sysctl to a smaller value. The kernel will attempt to balance the freeing of huge pages -across all on-line nodes. Any free huge pages on the selected nodes will -be freed back to the buddy allocator. - -Caveat: Shrinking the pool via nr_hugepages such that it becomes less -than the number of huge pages in use will convert the balance to surplus -huge pages even if it would exceed the overcommit value. As long as -this condition holds, however, no more surplus huge pages will be -allowed on the system until one of the two sysctls are increased -sufficiently, or the surplus huge pages go out of use and are freed. +across all nodes in the memory policy of the task modifying nr_hugepages. +Any free huge pages on the selected nodes will be freed back to the kernel's +normal page pool. + +Caveat: Shrinking the persistent huge page pool via nr_hugepages such that +it becomes less than the number of huge pages in use will convert the balance +of the in-use huge pages to surplus huge pages. This will occur even if +the number of surplus pages it would exceed the overcommit value. As long as +this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is +increased sufficiently, or the surplus huge pages go out of use and are freed-- +no more surplus huge pages will be allowed to be allocated. With support for multiple huge page pools at run-time available, much of -the huge page userspace interface has been duplicated in sysfs. The above -information applies to the default huge page size which will be -controlled by the /proc interfaces for backwards compatibility. The root -huge page control directory in sysfs is: +the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs. +The /proc interfaces discussed above have been retained for backwards +compatibility. The root huge page control directory in sysfs is: /sys/kernel/mm/hugepages For each huge page size supported by the running kernel, a subdirectory -will exist, of the form +will exist, of the form: hugepages-${size}kB Inside each of these directories, the same set of files will exist: nr_hugepages + nr_hugepages_mempolicy nr_overcommit_hugepages free_hugepages resv_hugepages @@ -159,6 +163,102 @@ Inside each of these directories, the same set of files will exist: which function as described above for the default huge page-sized case. + +Interaction of Task Memory Policy with Huge Page Allocation/Freeing + +Whether huge pages are allocated and freed via the /proc interface or +the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA +nodes from which huge pages are allocated or freed are controlled by the +NUMA memory policy of the task that modifies the nr_hugepages_mempolicy +sysctl or attribute. When the nr_hugepages attribute is used, mempolicy +is ignored. + +The recommended method to allocate or free huge pages to/from the kernel +huge page pool, using the nr_hugepages example above, is: + + numactl --interleave <node-list> echo 20 \ + >/proc/sys/vm/nr_hugepages_mempolicy + +or, more succinctly: + + numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy + +This will allocate or free abs(20 - nr_hugepages) to or from the nodes +specified in <node-list>, depending on whether number of persistent huge pages +is initially less than or greater than 20, respectively. No huge pages will be +allocated nor freed on any node not included in the specified <node-list>. + +When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any +memory policy mode--bind, preferred, local or interleave--may be used. The +resulting effect on persistent huge page allocation is as follows: + +1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt], + persistent huge pages will be distributed across the node or nodes + specified in the mempolicy as if "interleave" had been specified. + However, if a node in the policy does not contain sufficient contiguous + memory for a huge page, the allocation will not "fallback" to the nearest + neighbor node with sufficient contiguous memory. To do this would cause + undesirable imbalance in the distribution of the huge page pool, or + possibly, allocation of persistent huge pages on nodes not allowed by + the task's memory policy. + +2) One or more nodes may be specified with the bind or interleave policy. + If more than one node is specified with the preferred policy, only the + lowest numeric id will be used. Local policy will select the node where + the task is running at the time the nodes_allowed mask is constructed. + For local policy to be deterministic, the task must be bound to a cpu or + cpus in a single node. Otherwise, the task could be migrated to some + other node at any time after launch and the resulting node will be + indeterminate. Thus, local policy is not very useful for this purpose. + Any of the other mempolicy modes may be used to specify a single node. + +3) The nodes allowed mask will be derived from any non-default task mempolicy, + whether this policy was set explicitly by the task itself or one of its + ancestors, such as numactl. This means that if the task is invoked from a + shell with non-default policy, that policy will be used. One can specify a + node list of "all" with numactl --interleave or --membind [-m] to achieve + interleaving over all nodes in the system or cpuset. + +4) Any task mempolicy specifed--e.g., using numactl--will be constrained by + the resource limits of any cpuset in which the task runs. Thus, there will + be no way for a task with non-default policy running in a cpuset with a + subset of the system nodes to allocate huge pages outside the cpuset + without first moving to a cpuset that contains all of the desired nodes. + +5) Boot-time huge page allocation attempts to distribute the requested number + of huge pages over all on-lines nodes with memory. + +Per Node Hugepages Attributes + +A subset of the contents of the root huge page control directory in sysfs, +described above, will be replicated under each the system device of each +NUMA node with memory in: + + /sys/devices/system/node/node[0-9]*/hugepages/ + +Under this directory, the subdirectory for each supported huge page size +contains the following attribute files: + + nr_hugepages + free_hugepages + surplus_hugepages + +The free_' and surplus_' attribute files are read-only. They return the number +of free and surplus [overcommitted] huge pages, respectively, on the parent +node. + +The nr_hugepages attribute returns the total number of huge pages on the +specified node. When this attribute is written, the number of persistent huge +pages on the parent node will be adjusted to the specified value, if sufficient +resources exist, regardless of the task's mempolicy or cpuset constraints. + +Note that the number of overcommit and reserve pages remain global quantities, +as we don't know until fault time, when the faulting task's mempolicy is +applied, from which node the huge page allocation will be attempted. + + +Using Huge Pages + If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of type hugetlbfs: @@ -206,9 +306,11 @@ map_hugetlb.c. * requesting huge pages. * * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means the addresses starting with 0x800000... will need - * to be specified. Specifying a fixed address is not required on ppc64, - * i386 or x86_64. + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. * * Note: The default shared memory limit is quite low on many kernels, * you may need to increase it via: @@ -237,14 +339,8 @@ map_hugetlb.c. #define dprintf(x) printf(x) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) +#define ADDR (void *)(0x0UL) /* let kernel choose address */ #define SHMAT_FLAGS (0) -#endif int main(void) { @@ -302,10 +398,12 @@ int main(void) * example, the app is requesting memory of size 256MB that is backed by * huge pages. * - * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. */ #include <stdlib.h> #include <stdio.h> @@ -317,14 +415,8 @@ int main(void) #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) +#define ADDR (void *)(0x0UL) /* let kernel choose address */ #define FLAGS (MAP_SHARED) -#endif void check_bytes(char *addr) { diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 262d8e6..b392e49 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -16,9 +16,9 @@ by sharing the data common between them. But it can be useful to any application which generates many instances of the same data. KSM only merges anonymous (private) pages, never pagecache (file) pages. -KSM's merged pages are at present locked into kernel memory for as long -as they are shared: so cannot be swapped out like the user pages they -replace (but swapping KSM pages should follow soon in a later release). +KSM's merged pages were originally locked into kernel memory, but can now +be swapped out just like other user pages (but sharing is broken when they +are swapped back in: ksmd must rediscover their identity and merge again). KSM only operates on those areas of address space which an application has advised to be likely candidates for merging, by using the madvise(2) @@ -44,20 +44,12 @@ includes unmapped gaps (though working on the intervening mapped areas), and might fail with EAGAIN if not enough memory for internal structures. Applications should be considerate in their use of MADV_MERGEABLE, -restricting its use to areas likely to benefit. KSM's scans may use -a lot of processing power, and its kernel-resident pages are a limited -resource. Some installations will disable KSM for these reasons. +restricting its use to areas likely to benefit. KSM's scans may use a lot +of processing power: some installations will disable KSM for that reason. The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, readable by all but writable only by root: -max_kernel_pages - set to maximum number of kernel pages that KSM may use - e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages" - Value 0 imposes no limit on the kernel pages KSM may use; - but note that any process using MADV_MERGEABLE can cause - KSM to allocate these pages, unswappable until it exits. - Default: quarter of memory (chosen to not pin too much) - pages_to_scan - how many present pages to scan before ksmd goes to sleep e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" Default: 100 (chosen for demonstration purposes) @@ -75,7 +67,7 @@ run - set 0 to stop ksmd from running but keep merged pages, The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: -pages_shared - how many shared unswappable kernel pages KSM is using +pages_shared - how many shared pages are being used pages_sharing - how many more sites are sharing them i.e. how much saved pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree @@ -87,4 +79,4 @@ pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. Izik Eidus, -Hugh Dickins, 24 Sept 2009 +Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index ea44ea5..7a7d9ba 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -100,7 +100,7 @@ #define BIT(name) (1ULL << KPF_##name) #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) -static char *page_flag_names[] = { +static const char *page_flag_names[] = { [KPF_LOCKED] = "L:locked", [KPF_ERROR] = "E:error", [KPF_REFERENCED] = "R:referenced", @@ -173,7 +173,7 @@ static int kpageflags_fd; static int opt_hwpoison; static int opt_unpoison; -static char *hwpoison_debug_fs = "/debug/hwpoison"; +static const char hwpoison_debug_fs[] = "/debug/hwpoison"; static int hwpoison_inject_fd; static int hwpoison_forget_fd; @@ -560,7 +560,7 @@ static void walk_pfn(unsigned long voffset, { uint64_t buf[KPAGEFLAGS_BATCH]; unsigned long batch; - unsigned long pages; + long pages; unsigned long i; while (count) { @@ -673,30 +673,35 @@ static void usage(void) printf( "page-types [options]\n" -" -r|--raw Raw mode, for kernel developers\n" -" -a|--addr addr-spec Walk a range of pages\n" -" -b|--bits bits-spec Walk pages with specified bits\n" -" -p|--pid pid Walk process address space\n" +" -r|--raw Raw mode, for kernel developers\n" +" -d|--describe flags Describe flags\n" +" -a|--addr addr-spec Walk a range of pages\n" +" -b|--bits bits-spec Walk pages with specified bits\n" +" -p|--pid pid Walk process address space\n" #if 0 /* planned features */ -" -f|--file filename Walk file address space\n" +" -f|--file filename Walk file address space\n" #endif -" -l|--list Show page details in ranges\n" -" -L|--list-each Show page details one by one\n" -" -N|--no-summary Don't show summay info\n" -" -X|--hwpoison hwpoison pages\n" -" -x|--unpoison unpoison pages\n" -" -h|--help Show this usage message\n" +" -l|--list Show page details in ranges\n" +" -L|--list-each Show page details one by one\n" +" -N|--no-summary Don't show summay info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" +" -h|--help Show this usage message\n" +"flags:\n" +" 0x10 bitfield format, e.g.\n" +" anon bit-name, e.g.\n" +" 0x10,anon comma-separated list, e.g.\n" "addr-spec:\n" -" N one page at offset N (unit: pages)\n" -" N+M pages range from N to N+M-1\n" -" N,M pages range from N to M-1\n" -" N, pages range from N to end\n" -" ,M pages range from 0 to M-1\n" +" N one page at offset N (unit: pages)\n" +" N+M pages range from N to N+M-1\n" +" N,M pages range from N to M-1\n" +" N, pages range from N to end\n" +" ,M pages range from 0 to M-1\n" "bits-spec:\n" -" bit1,bit2 (flags & (bit1|bit2)) != 0\n" -" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" -" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" -" =bit1,bit2 flags == (bit1|bit2)\n" +" bit1,bit2 (flags & (bit1|bit2)) != 0\n" +" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" +" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" +" =bit1,bit2 flags == (bit1|bit2)\n" "bit-names:\n" ); @@ -884,13 +889,23 @@ static void parse_bits_mask(const char *optarg) add_bits_filter(mask, bits); } +static void describe_flags(const char *optarg) +{ + uint64_t flags = parse_flag_names(optarg, 0); -static struct option opts[] = { + printf("0x%016llx\t%s\t%s\n", + (unsigned long long)flags, + page_flag_name(flags), + page_flag_longname(flags)); +} + +static const struct option opts[] = { { "raw" , 0, NULL, 'r' }, { "pid" , 1, NULL, 'p' }, { "file" , 1, NULL, 'f' }, { "addr" , 1, NULL, 'a' }, { "bits" , 1, NULL, 'b' }, + { "describe" , 1, NULL, 'd' }, { "list" , 0, NULL, 'l' }, { "list-each" , 0, NULL, 'L' }, { "no-summary", 0, NULL, 'N' }, @@ -907,7 +922,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:lLNXxh", opts, NULL)) != -1) { + "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -924,6 +939,9 @@ int main(int argc, char *argv[]) case 'b': parse_bits_mask(optarg); break; + case 'd': + describe_flags(optarg); + exit(0); case 'l': opt_list = 1; break; |