/*
 * Definitions for MCT (Magic Control Technology) USB-RS232 Converter Driver
 *
 *   Copyright (C) 2000 Wolfgang Grandegger (wolfgang@ces.ch)
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 * This driver is for the device MCT USB-RS232 Converter (25 pin, Model No.
 * U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
 * Model No. U232-P9). See http://www.mct.com.tw/p_u232.html for further
 * information. The properties of this device are listed at the end of this
 * file. This device is available from various distributors. I know of Hana,
 * http://www.hana.de, and D-Link, http://www.dlink.com/products/usb/dsbs25.
 *
 * All of the information about the device was acquired by using SniffUSB
 * on Windows98. The technical details of the reverse engineering are
 * summarized at the end of this file.
 */

#ifndef __LINUX_USB_SERIAL_MCT_U232_H
#define __LINUX_USB_SERIAL_MCT_U232_H

#define MCT_U232_VID	                0x0711	/* Vendor Id */
#define MCT_U232_PID	                0x0210	/* Original MCT Product Id */

/* U232-P25, Sitecom */
#define MCT_U232_SITECOM_PID		0x0230	/* Sitecom Product Id */

/* DU-H3SP USB BAY hub */
#define MCT_U232_DU_H3SP_PID		0x0200	/* D-Link DU-H3SP USB BAY */

/* Belkin badge the MCT U232-P9 as the F5U109 */
#define MCT_U232_BELKIN_F5U109_VID	0x050d	/* Vendor Id */
#define MCT_U232_BELKIN_F5U109_PID	0x0109	/* Product Id */

/*
 * Vendor Request Interface
 */
#define MCT_U232_SET_REQUEST_TYPE	0x40
#define MCT_U232_GET_REQUEST_TYPE	0xc0

#define MCT_U232_GET_MODEM_STAT_REQUEST 2  /* Get Modem Status Register (MSR) */
#define MCT_U232_GET_MODEM_STAT_SIZE    1

#define MCT_U232_GET_LINE_CTRL_REQUEST  6  /* Get Line Control Register (LCR) */
#define MCT_U232_GET_LINE_CTRL_SIZE     1  /* ... not used by this driver */

#define MCT_U232_SET_BAUD_RATE_REQUEST	5  /* Set Baud Rate Divisor */
#define MCT_U232_SET_BAUD_RATE_SIZE     4

#define MCT_U232_SET_LINE_CTRL_REQUEST	7  /* Set Line Control Register (LCR) */
#define MCT_U232_SET_LINE_CTRL_SIZE     1

#define MCT_U232_SET_MODEM_CTRL_REQUEST	10 /* Set Modem Control Register (MCR) */
#define MCT_U232_SET_MODEM_CTRL_SIZE    1

/* This USB device request code is not well understood.  It is transmitted by
   the MCT-supplied Windows driver whenever the baud rate changes. 
*/
#define MCT_U232_SET_UNKNOWN1_REQUEST   11  /* Unknown functionality */
#define MCT_U232_SET_UNKNOWN1_SIZE       1

/* This USB device request code appears to control whether CTS is required
   during transmission.
   
   Sending a zero byte allows data transmission to a device which is not
   asserting CTS.  Sending a '1' byte will cause transmission to be deferred
   until the device asserts CTS.
*/
#define MCT_U232_SET_CTS_REQUEST   12
#define MCT_U232_SET_CTS_SIZE       1
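
/*
 * Illustrative sketch, not part of the original driver: these vendor
 * requests are plain USB control transfers. The helper below is
 * hypothetical (the real transfers are issued from mct_u232.c); it
 * assumes <linux/usb.h> is visible to the including file, that 'buf'
 * points to DMA-able (kmalloc'ed) memory, and it uses an arbitrary
 * 5 second timeout.
 */
static inline int mct_u232_set_cts_sketch(struct usb_device *dev,
					  unsigned char *buf)
{
	/* *buf == 0: transmit without CTS; *buf == 1: wait for CTS */
	return usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
			       MCT_U232_SET_CTS_REQUEST,
			       MCT_U232_SET_REQUEST_TYPE,
			       0, 0, buf, MCT_U232_SET_CTS_SIZE, 5000);
}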

/*
 * Baud rate (divisor)
 * Actually, there are two of them, MCT website calls them "Philips solution"
 * and "Intel solution". They are the regular MCT and "Sitecom" for us.
 * This is pointless to document in the header, see the code for the bits.
 */
static int mct_u232_calculate_baud_rate(struct usb_serial *serial,
					speed_t value, speed_t *result);
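
/*
 * Illustrative sketch, not part of the original driver: for the plain
 * MCT ("Philips solution") parts, the divisor observed with SniffUSB is
 * simply 115200 / baud (see the notes at the end of this file); the
 * Sitecom/Belkin parts use a baud-rate code instead.
 */
static inline unsigned int mct_u232_divisor_sketch(unsigned int baud)
{
	return 115200 / baud;	/* e.g. 9600 baud -> divisor 12 */
}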

/*
 * Line Control Register (LCR)
 */
#define MCT_U232_SET_BREAK              0x40

#define MCT_U232_PARITY_SPACE		0x38
#define MCT_U232_PARITY_MARK		0x28
#define MCT_U232_PARITY_EVEN		0x18
#define MCT_U232_PARITY_ODD		0x08
#define MCT_U232_PARITY_NONE		0x00

#define MCT_U232_DATA_BITS_5            0x00
#define MCT_U232_DATA_BITS_6            0x01
#define MCT_U232_DATA_BITS_7            0x02
#define MCT_U232_DATA_BITS_8            0x03

#define MCT_U232_STOP_BITS_2            0x04
#define MCT_U232_STOP_BITS_1            0x00
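
/*
 * Illustrative sketch, not part of the original driver: the parity,
 * data-bit and stop-bit fields above are simply OR'ed into a single
 * LCR byte, e.g. for the common 8N1 line setting:
 */
static inline unsigned char mct_u232_lcr_8n1_sketch(void)
{
	return MCT_U232_PARITY_NONE | MCT_U232_DATA_BITS_8 |
	       MCT_U232_STOP_BITS_1;
}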

/*
 * Modem Control Register (MCR)
 */
#define MCT_U232_MCR_NONE               0x8     /* Deactivate DTR and RTS */
#define MCT_U232_MCR_RTS                0xa     /* Activate RTS */
#define MCT_U232_MCR_DTR                0x9     /* Activate DTR */

/*
 * Modem Status Register (MSR)
 */
#define MCT_U232_MSR_INDEX              0x0     /* data[index] */
#define MCT_U232_MSR_CD                 0x80    /* Current CD */
#define MCT_U232_MSR_RI                 0x40    /* Current RI */
#define MCT_U232_MSR_DSR                0x20    /* Current DSR */
#define MCT_U232_MSR_CTS                0x10    /* Current CTS */
#define MCT_U232_MSR_DCD                0x08    /* Delta CD */
#define MCT_U232_MSR_DRI                0x04    /* Delta RI */
#define MCT_U232_MSR_DDSR               0x02    /* Delta DSR */
#define MCT_U232_MSR_DCTS               0x01    /* Delta CTS */

/*
 * Line Status Register (LSR)
 */
#define MCT_U232_LSR_INDEX              1       /* data[index] */
#define MCT_U232_LSR_ERR                0x80    /* OE | PE | FE | BI */
#define MCT_U232_LSR_TEMT               0x40    /* transmit register empty */
#define MCT_U232_LSR_THRE               0x20    /* transmit holding register empty */
#define MCT_U232_LSR_BI                 0x10    /* break indicator */
#define MCT_U232_LSR_FE                 0x08    /* framing error */
#define MCT_U232_LSR_PE                 0x04    /* parity error */
#define MCT_U232_LSR_OE                 0x02    /* overrun error */
#define MCT_U232_LSR_DR                 0x01    /* receive data ready */
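
/*
 * Illustrative sketch, not part of the original driver: the interrupt-in
 * endpoint 0x83 delivers the MSR and the LSR as consecutive bytes (see
 * the endpoint notes below), so a status packet can be decoded like this:
 */
static inline void mct_u232_decode_status_sketch(const unsigned char *data,
						 unsigned char *msr,
						 unsigned char *lsr)
{
	*msr = data[MCT_U232_MSR_INDEX];	/* modem status byte */
	*lsr = data[MCT_U232_LSR_INDEX];	/* line status byte */
}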


/* -----------------------------------------------------------------------------
 * Technical Specification reverse engineered with SniffUSB on Windows98
 * =====================================================================
 *
 *  The technical details of the device have been acquired by using "SniffUSB"
 *  and the vendor-supplied device driver (version 2.3A) under Windows98. To
 *  identify the USB vendor-specific requests and to assign them to terminal
 *  settings (flow control, baud rate, etc.) the program "SerialSettings" from
 *  William G. Greathouse proved to be very useful. I also used the Win98
 *  "HyperTerminal" and "usb-robot" on Linux for testing. The results and
 *  observations are summarized below:
 *
 *  The USB requests seem to be directly mapped to the registers of an 8250,
 *  16450 or 16550 UART. The FreeBSD handbook (appendix F.4 "Input/Output
 *  devices") contains a comprehensive description of UARTs and their
 *  registers. The bit descriptions below are actually taken from there.
 *
 *
 * Baud rate (divisor)
 * -------------------
 *
 *   BmRequestType:  0x40 (0100 0000B)
 *   bRequest:       0x05
 *   wValue:         0x0000
 *   wIndex:         0x0000
 *   wLength:        0x0004
 *   Data:           divisor = 115200 / baud_rate
 *
 *   SniffUSB observations (Nov 2003): Contrary to the 'wLength' value of 4
 *   shown above, observations with a Belkin F5U109 adapter, using the
 *   MCT-supplied Windows98 driver (U2SPORT.VXD, "File version: 1.21P.0104 for
 *   Win98/Me"), show this request has a length of 1 byte, presumably because
 *   of the fact that the Belkin adapter and the 'Sitecom U232-P25' adapter
 *   use a baud-rate code instead of a conventional RS-232 baud rate divisor.
 *   The current source code for this driver does not reflect this fact, but
 *   the driver works fine with this adapter/driver combination nonetheless.
 *
 *
 * Line Control Register (LCR)
 * ---------------------------
 *
 *  BmRequestType:  0x40 (0100 0000B)    0xc0 (1100 0000B)
 *  bRequest:       0x07                 0x06
 *  wValue:         0x0000
 *  wIndex:         0x0000
 *  wLength:        0x0001
 *  Data:           LCR (see below)
 *
 *  Bit 7: Divisor Latch Access Bit (DLAB). When set, access to the data
 *  	   transmit/receive register (THR/RBR) and the Interrupt Enable Register
 *  	   (IER) is disabled. Any access to these ports is now redirected to the
 *  	   Divisor Latch Registers. Setting this bit, loading the Divisor
 *  	   Registers, and clearing DLAB should be done with interrupts disabled.
 *  Bit 6: Set Break. When set to "1", the transmitter begins to transmit
 *  	   continuous Spacing until this bit is set to "0". This overrides any
 *  	   bits of characters that are being transmitted.
 *  Bit 5: Stick Parity. When parity is enabled, setting this bit causes parity
 *  	   to always be "1" or "0", based on the value of Bit 4.
 *  Bit 4: Even Parity Select (EPS). When parity is enabled and Bit 5 is "0",
 *  	   setting this bit causes even parity to be transmitted and expected.
 *  	   Otherwise, odd parity is used.
 *  Bit 3: Parity Enable (PEN). When set to "1", a parity bit is inserted
 *  	   between the last bit of the data and the Stop Bit. The UART will also
 *  	   expect parity to be present in the received data.
 *  Bit 2: Number of Stop Bits (STB). If set to "1" and using 5-bit data words,
 *  	   1.5 Stop Bits are transmitted and expected in each data word. For
 *  	   6, 7 and 8-bit data words, 2 Stop Bits are transmitted and expected.
 *  	   When this bit is set to "0", one Stop Bit is used on each data word.
 *  Bit 1: Word Length Select Bit #1 (WLSB1)
 *  Bit 0: Word Length Select Bit #0 (WLSB0)
 *  	   Together these bits specify the number of bits in each data word.
 *  	     1 0  Word Length
 *  	     0 0  5 Data Bits
 *  	     0 1  6 Data Bits
 *  	     1 0  7 Data Bits
 *  	     1 1  8 Data Bits
 *
 *  SniffUSB observations: Bit 7 seems not to be used. There seem to be two bugs
 *  in the Win98 driver: the break does not work (bit 6 is not asserted) and the
 *  stick parity bit is not cleared when set once. The LCR can also be read
 *  back with USB request 6 but this has never been observed with SniffUSB.
 *
 *
 * Modem Control Register (MCR)
 * ----------------------------
 *
 *  BmRequestType:  0x40  (0100 0000B)
 *  bRequest:       0x0a
 *  wValue:         0x0000
 *  wIndex:         0x0000
 *  wLength:        0x0001
 *  Data:           MCR (Bit 4..7, see below)
 *
 *  Bit 7: Reserved, always 0.
 *  Bit 6: Reserved, always 0.
 *  Bit 5: Reserved, always 0.
 *  Bit 4: Loop-Back Enable. When set to "1", the UART transmitter and receiver
 *  	   are internally connected together to allow diagnostic operations. In
 *  	   addition, the UART modem control outputs are connected to the UART
 *  	   modem control inputs. CTS is connected to RTS, DTR is connected to
 *  	   DSR, OUT1 is connected to RI, and OUT 2 is connected to DCD.
 *  Bit 3: OUT 2. An auxiliary output that the host processor may set high or
 *  	   low. In the IBM PC serial adapter (and most clones), OUT 2 is used
 *  	   to tri-state (disable) the interrupt signal from the
 *  	   8250/16450/16550 UART.
 *  Bit 2: OUT 1. An auxiliary output that the host processor may set high or
 *  	   low. This output is not used on the IBM PC serial adapter.
 *  Bit 1: Request to Send (RTS). When set to "1", the output of the UART -RTS
 *  	   line is Low (Active).
 *  Bit 0: Data Terminal Ready (DTR). When set to "1", the output of the UART
 *  	   -DTR line is Low (Active).
 *
 *  SniffUSB observations: Bit 2 and 4 seem not to be used but bit 3 has been
 *  seen _always_ set.
 *
 *
 * Modem Status Register (MSR)
 * ---------------------------
 *
 *  BmRequestType:  0xc0  (1100 0000B)
 *  bRequest:       0x02
 *  wValue:         0x0000
 *  wIndex:         0x0000
 *  wLength:        0x0001
 *  Data:           MSR (see below)
 *
 *  Bit 7: Data Carrier Detect (CD). Reflects the state of the DCD line on the
 *  	   UART.
 *  Bit 6: Ring Indicator (RI). Reflects the state of the RI line on the UART.
 *  Bit 5: Data Set Ready (DSR). Reflects the state of the DSR line on the UART.
 *  Bit 4: Clear To Send (CTS). Reflects the state of the CTS line on the UART.
 *  Bit 3: Delta Data Carrier Detect (DDCD). Set to "1" if the -DCD line has
 *  	   changed state one or more times since the last time the MSR was
 *  	   read by the host.
 *  Bit 2: Trailing Edge Ring Indicator (TERI). Set to "1" if the -RI line has
 *  	   had a low to high transition since the last time the MSR was read by
 *  	   the host.
 *  Bit 1: Delta Data Set Ready (DDSR). Set to "1" if the -DSR line has changed
 *  	   state one or more times since the last time the MSR was read by the
 *  	   host.
 *  Bit 0: Delta Clear To Send (DCTS). Set to "1" if the -CTS line has changed
 *  	   state one or more times since the last time the MSR was read by the
 *  	   host.
 *
 *  SniffUSB observations: the MSR is also returned as first byte on the
 *  interrupt-in endpoint 0x83 to signal changes of modem status lines. The USB
 *  request to read MSR cannot be applied during normal device operation.
 *
 *
 * Line Status Register (LSR)
 * --------------------------
 *
 *  Bit 7   Error in Receiver FIFO. On the 8250/16450 UART, this bit is zero.
 *  	    This bit is set to "1" when any of the bytes in the FIFO have one or
 *  	    more of the following error conditions: PE, FE, or BI.
 *  Bit 6   Transmitter Empty (TEMT). When set to "1", there are no words
 *  	    remaining in the transmit FIFO or the transmit shift register. The
 *  	    transmitter is completely idle.
 *  Bit 5   Transmitter Holding Register Empty (THRE). When set to "1", the FIFO
 *  	    (or holding register) now has room for at least one additional word
 *  	    to transmit. The transmitter may still be transmitting when this bit
 *  	    is set to "1".
 *  Bit 4   Break Interrupt (BI). The receiver has detected a Break signal.
 *  Bit 3   Framing Error (FE). A Start Bit was detected but the Stop Bit did not
 *  	    appear at the expected time. The received word is probably garbled.
 *  Bit 2   Parity Error (PE). The parity bit was incorrect for the word received.
 *  Bit 1   Overrun Error (OE). A new word was received and there was no room in
 *  	    the receive buffer. The newly-arrived word in the shift register is
 *  	    discarded. On 8250/16450 UARTs, the word in the holding register is
 *  	    discarded and the newly-arrived word is put in the holding register.
 *  Bit 0   Data Ready (DR). One or more words are in the receive FIFO that the
 *  	    host may read. A word must be completely received and moved from the
 *  	    shift register into the FIFO (or holding register for 8250/16450
 *  	    designs) before this bit is set.
 *
 *  SniffUSB observations: the LSR is returned as second byte on the interrupt-in
 *  endpoint 0x83 to signal error conditions. Such errors have been seen with
 *  minicom/zmodem transfers (CRC errors).
 *
 *
 * Unknown #1
 * -------------------
 *
 *   BmRequestType:  0x40 (0100 0000B)
 *   bRequest:       0x0b
 *   wValue:         0x0000
 *   wIndex:         0x0000
 *   wLength:        0x0001
 *   Data:           0x00
 *
 *   SniffUSB observations (Nov 2003): With the MCT-supplied Windows98 driver
 *   (U2SPORT.VXD, "File version: 1.21P.0104 for Win98/Me"), this request
 *   occurs immediately after a "Baud rate (divisor)" message.  It was not
 *   observed at any other time.  It is unclear what purpose this message
 *   serves.
 *
 *
 * Unknown #2
 * -------------------
 *
 *   BmRequestType:  0x40 (0100 0000B)
 *   bRequest:       0x0c
 *   wValue:         0x0000
 *   wIndex:         0x0000
 *   wLength:        0x0001
 *   Data:           0x00
 *
 *   SniffUSB observations (Nov 2003): With the MCT-supplied Windows98 driver
 *   (U2SPORT.VXD, "File version: 1.21P.0104 for Win98/Me"), this request
 *   occurs immediately after the 'Unknown #1' message (see above).  It was
 *   not observed at any other time.  It is unclear what other purpose (if
 *   any) this message might serve, but without it, the USB/RS-232 adapter
 *   will not write to RS-232 devices which do not assert the 'CTS' signal.
 *
 *
 * Flow control
 * ------------
 *
 *  SniffUSB observations: no flow-control-specific requests have been
 *  observed apart from DTR/RTS settings. Both signals are dropped for no
 *  flow control but asserted for hardware or software flow control.
 *
 *
 * Endpoint usage
 * --------------
 *
 *  SniffUSB observations: the bulk-out endpoint 0x1 and the interrupt-in
 *  endpoint 0x81 are used to transmit and receive characters. The second
 *  interrupt-in endpoint 0x83 signals exceptional conditions like modem
 *  line changes and errors. The first byte returned is the MSR and the
 *  second byte the LSR.
 *
 *
 * Other observations
 * ------------------
 *
 *  Queued bulk transfers as used in visor.c did not work.
 *  
 *
 * Properties of the USB device used (as found in /var/log/messages)
 * -----------------------------------------------------------------
 *
 *  Manufacturer: MCT Corporation.
 *  Product: USB-232 Interfact Controller
 *  SerialNumber: U2S22050
 *
 *    Length              = 18
 *    DescriptorType      = 01
 *    USB version         = 1.00
 *    Vendor:Product      = 0711:0210
 *    MaxPacketSize0      = 8
 *    NumConfigurations   = 1
 *    Device version      = 1.02
 *    Device Class:SubClass:Protocol = 00:00:00
 *      Per-interface classes
 *  Configuration:
 *    bLength             =    9
 *    bDescriptorType     =   02
 *    wTotalLength        = 0027
 *    bNumInterfaces      =   01
 *    bConfigurationValue =   01
 *    iConfiguration      =   00
 *    bmAttributes        =   c0
 *    MaxPower            =  100mA
 *
 *    Interface: 0
 *    Alternate Setting:  0
 *      bLength             =    9
 *      bDescriptorType     =   04
 *      bInterfaceNumber    =   00
 *      bAlternateSetting   =   00
 *      bNumEndpoints       =   03
 *      bInterface Class:SubClass:Protocol =   00:00:00
 *      iInterface          =   00
 *      Endpoint:
 * 	  bLength             =    7
 * 	  bDescriptorType     =   05
 * 	  bEndpointAddress    =   81 (in)
 * 	  bmAttributes        =   03 (Interrupt)
 * 	  wMaxPacketSize      = 0040
 * 	  bInterval           =   02
 *      Endpoint:
 * 	  bLength             =    7
 * 	  bDescriptorType     =   05
 * 	  bEndpointAddress    =   01 (out)
 * 	  bmAttributes        =   02 (Bulk)
 * 	  wMaxPacketSize      = 0040
 * 	  bInterval           =   00
 *      Endpoint:
 * 	  bLength             =    7
 * 	  bDescriptorType     =   05
 * 	  bEndpointAddress    =   83 (in)
 * 	  bmAttributes        =   03 (Interrupt)
 * 	  wMaxPacketSize      = 0002
 * 	  bInterval           =   02
 *
 *
 * Hardware details (added by Martin Hamilton, 2001/12/06)
 * -----------------------------------------------------------------
 *
 * This info was gleaned from opening a Belkin F5U109 DB9 USB serial
 * adaptor, which turns out to simply be a re-badged U232-P9.  We
 * know this because there is a sticky label on the circuit board
 * which says "U232-P9" ;-)
 * 
 * The circuit board inside the adaptor contains a Philips PDIUSBD12
 * USB endpoint chip and a Philips P87C52UBAA microcontroller with
 * embedded UART.  Exhaustive documentation for these is available at:
 *
 *   http://www.semiconductors.philips.com/pip/p87c52ubaa
 *   http://www.semiconductors.philips.com/pip/pdiusbd12
 *
 * Thanks to Julian Highfield for the pointer to the Philips database.
 * 
 */

#endif /* __LINUX_USB_SERIAL_MCT_U232_H */

/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/module.h>
#include <asm/futex.h>

#include "rtmutex_common.h"

#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
#else
# include "rtmutex.h"
#endif

#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	atomic_t refcount;

	union futex_key key;
};

/*
 * We use this hashed waitqueue instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * wake up q->waiters, then make the second condition true.
 */
struct futex_q {
	struct plist_node list;
	wait_queue_head_t waiters;

	/* Which hash list lock to use: */
	spinlock_t *lock_ptr;

	/* Key which the futex is hashed on: */
	union futex_key key;

	/* For fd, sigio sent using these: */
	int fd;
	struct file *filp;

	/* Optional priority inheritance state: */
	struct futex_pi_state *pi_state;
	struct task_struct *task;

	/*
	 * This waiter is used in case of requeue from a
	 * normal futex to a PI-futex
	 */
	struct rt_mutex_waiter waiter;
};

/*
 * Split the global futex_lock into every hash list lock.
 */
struct futex_hash_bucket {
	spinlock_t lock;
	struct plist_head chain;
};

static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];

/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;

/*
 * We hash on the keys returned from get_futex_key (see below).
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}

/*
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/**
 * get_futex_key - Get parameters which are the keys for a futex.
 * @uaddr: virtual address of the futex
 * @fshared: NULL for a PROCESS_PRIVATE futex,
 *	&current->mm->mmap_sem for a PROCESS_SHARED futex
 * @key: address where result is stored.
 *
 * Returns a negative error code or 0
 * The key words are stored in *key on success.
 *
 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * fshared is NULL for PROCESS_PRIVATE futexes.
 * For other futexes, it points to &current->mm->mmap_sem and the
 * caller must have taken the reader lock, but NOT any spinlocks.
 */
int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
		  union futex_key *key)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page *page;
	int err;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 *       but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
			return -EFAULT;
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}
	/*
	 * The futex is hashed differently depending on whether
	 * it's in a shared or private mapping.  So check vma first.
	 */
	vma = find_extend_vma(mm, address);
	if (unlikely(!vma))
		return -EFAULT;

	/*
	 * Permissions.
	 */
	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;

	/* Save the user address in the key */
	key->uaddr = uaddr;

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.  Therefore we use
	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
	 * mappings of _writable_ handles.
	 */
	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}

	/*
	 * Linear file mappings are also simple.
	 */
	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
				     + vma->vm_pgoff);
		return 0;
	}

	/*
	 * We could walk the page table to read the non-linear
	 * pte, and get the page index without fetching the page
	 * from swap.  But that's a lot of code to duplicate here
	 * for a rare case, so we simply fetch the page.
	 */
	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
	if (err >= 0) {
		key->shared.pgoff =
			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		put_page(page);
		return 0;
	}
	return err;
}
EXPORT_SYMBOL_GPL(get_futex_key);

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 *
 */
inline void get_futex_key_refs(union futex_key *key)
{
	if (key->both.ptr == 0)
		return;
	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
		case FUT_OFF_INODE:
			atomic_inc(&key->shared.inode->i_count);
			break;
		case FUT_OFF_MMSHARED:
			atomic_inc(&key->private.mm->mm_count);
			break;
	}
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held.
 */
void drop_futex_key_refs(union futex_key *key)
{
	if (key->both.ptr == 0)
		return;
	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
		case FUT_OFF_INODE:
			iput(key->shared.inode);
			break;
		case FUT_OFF_MMSHARED:
			mmdrop(key->private.mm);
			break;
	}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);

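/*
 * Read a u32 futex value with page faults disabled: the caller holds a
 * hash-bucket spinlock, so a fault must fail fast instead of sleeping.
 * Returns 0 on success, -EFAULT otherwise.
 */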
static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/*
 * Fault handling.
 * if fshared is non NULL, current->mm->mmap_sem is already held
 */
static int futex_handle_fault(unsigned long address,
			      struct rw_semaphore *fshared, int attempt)
{
	struct vm_area_struct * vma;
	struct mm_struct *mm = current->mm;
	int ret = -EFAULT;

	if (attempt > 2)
		return ret;

	if (!fshared)
		down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (vma && address >= vma->vm_start &&
	    (vma->vm_flags & VM_WRITE)) {
		switch (handle_mm_fault(mm, vma, address, 1)) {
		case VM_FAULT_MINOR:
			ret = 0;
			current->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			ret = 0;
			current->maj_flt++;
			break;
		}
	}
	if (!fshared)
		up_read(&mm->mmap_sem);
	return ret;
}

/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state * alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void free_pi_state(struct futex_pi_state *pi_state)
{
	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	if (current->pi_state_cache)
		kfree(pi_state);
	else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * Look up the task based on what TID userspace gave us.
 * We don't trust it.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_pid(pid);
	if (!p)
		goto out_unlock;
	if ((current->euid != p->euid) && (current->euid != p->uid)) {
		p = NULL;
		goto out_unlock;
	}
	if (p->exit_state != 0) {
		p = NULL;
		goto out_unlock;
	}
	get_task_struct(p);
out_unlock:
	rcu_read_unlock();

	return p;
}

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {

		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);
		spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);

		spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			spin_unlock(&hb->lock);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
		spin_unlock_irq(&curr->pi_lock);

		rt_mutex_unlock(&pi_state->pi_mutex);

		spin_unlock(&hb->lock);

		spin_lock_irq(&curr->pi_lock);
	}
	spin_unlock_irq(&curr->pi_lock);
}

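/*
 * Look up the pi_state for a futex key: take a reference on the pi_state
 * of an existing waiter on the same key, or attach a fresh pi_state to
 * the task whose TID is encoded in the futex value. Called with the
 * hash-bucket lock held.
 */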
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
		union futex_key *key, struct futex_pi_state **ps)
{
	struct futex_pi_state *pi_state = NULL;
	struct futex_q *this, *next;
	struct plist_head *head;
	struct task_struct *p;
	pid_t pid;

	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex(&this->key, key)) {
			/*
			 * Another waiter already exists - bump up
			 * the refcount and return its pi_state:
			 */
			pi_state = this->pi_state;
			/*
			 * Userspace might have messed up non PI and PI futexes
			 */
			if (unlikely(!pi_state))
				return -EINVAL;

			WARN_ON(!atomic_read(&pi_state->refcount));

			atomic_inc(&pi_state->refcount);
			*ps = pi_state;

			return 0;
		}
	}

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when the owner died bit is set
	 * and TID = 0:
	 */
	pid = uval & FUTEX_TID_MASK;
	if (!pid && (uval & FUTEX_OWNER_DIED))
		return -ESRCH;
	p = futex_find_get_task(pid);
	if (!p)
		return -ESRCH;

	pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make 'p'
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	spin_lock_irq(&p->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;
	spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	*ps = pi_state;

	return 0;
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed.
 */
static void wake_futex(struct futex_q *q)
{
	plist_del(&q->list, &q->list.plist);
	if (q->filp)
		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
	/*
	 * The lock in wake_up_all() is a crucial memory barrier after the
	 * plist_del() and also before assigning to q->lock_ptr.
	 */
	wake_up_all(&q->waiters);
	/*
	 * The waiting task can free the futex_q as soon as this is written,
	 * without taking any locks.  This must come last.
	 *
	 * A memory barrier is required here to prevent the following store
	 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
	 * at the end of wake_up_all() does not prevent this store from
	 * moving.
	 */
	smp_wmb();
	q->lock_ptr = NULL;
}

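/*
 * Wake the top waiter of a PI futex: hand the user-space futex value and
 * the pi_state over to the new owner and release the rt_mutex. Called
 * with the hash-bucket lock held.
 */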
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
{
	struct task_struct *new_owner;
	struct futex_pi_state *pi_state = this->pi_state;
	u32 curval, newval;

	if (!pi_state)
		return -EINVAL;

	spin_lock(&pi_state->pi_mutex.wait_lock);
	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

	/*
	 * This happens when we have stolen the lock and the original
	 * pending owner did not enqueue itself back on the rt_mutex.
	 * That's not a tragedy. This way we know that a lock waiter
	 * is on its way. We make the futex_q waiter the pending owner.
	 */
	if (!new_owner)
		new_owner = this->task;

	/*
	 * We pass it to the next owner. (The WAITERS bit is always
	 * kept enabled while there is PI state around. We must also
	 * preserve the owner died bit.)
	 */
	if (!(uval & FUTEX_OWNER_DIED)) {
		newval = FUTEX_WAITERS | new_owner->pid;
		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
		newval |= (uval & FUTEX_WAITER_REQUEUED);

		pagefault_disable();
		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
		pagefault_enable();
		if (curval == -EFAULT)
			return -EFAULT;
		if (curval != uval)
			return -EINVAL;
	}

	spin_lock_irq(&pi_state->owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	spin_unlock_irq(&pi_state->owner->pi_lock);

	spin_lock_irq(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	spin_unlock_irq(&new_owner->pi_lock);

	spin_unlock(&pi_state->pi_mutex.wait_lock);
	rt_mutex_unlock(&pi_state->pi_mutex);

	return 0;
}

static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
	u32 oldval;

	/*
	 * There is no waiter, so we unlock the futex. The owner-died
	 * bit need not be preserved here. We are the owner:
	 */
	pagefault_disable();
	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
	pagefault_enable();

	if (oldval == -EFAULT)
		return oldval;
	if (oldval != uval)
		return -EAGAIN;

	return 0;
}

/*
 * Express the locking dependencies for lockdep:
 */
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	if (hb1 <= hb2) {
		spin_lock(&hb1->lock);
		if (hb1 < hb2)
			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
	} else { /* hb1 > hb2 */
		spin_lock(&hb2->lock);
		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
	}
}

/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
		      int nr_wake)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	struct plist_head *head;
	union futex_key key;
	int ret;

	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr, fshared, &key);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
	spin_lock(&hb->lock);
	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex (&this->key, &key)) {
			if (this->pi_state) {
				ret = -EINVAL;
				break;
			}
			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}

	spin_unlock(&hb->lock);
out:
	if (fshared)
		up_read(fshared);
	return ret;
}

/*
 * Called from futex_requeue_pi.
 * Set the FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
 * PI-futex value; look up its associated pi_state if an owner exists,
 * or create a new one without an owner.
 */
static inline int
lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
			    union futex_key *key,
			    struct futex_pi_state **pi_state)
{
	u32 curval, uval, newval;

retry:
	/*
	 * We can't handle a fault cleanly because we can't
	 * release the locks here. Simply return the fault.
	 */
	if (get_futex_value_locked(&curval, uaddr))
		return -EFAULT;

	/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
	if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
	    != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
		/*
		 * No waiters yet, we prepare the futex to have some waiters.
		 */

		uval = curval;
		newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;

		pagefault_disable();
		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
		pagefault_enable();

		if (unlikely(curval == -EFAULT))
			return -EFAULT;
		if (unlikely(curval != uval))
			goto retry;
	}

	if (!(curval & FUTEX_TID_MASK)
	    || lookup_pi_state(curval, hb, key, pi_state)) {
		/* the futex has no owner (yet) or the lookup failed:
		   allocate one pi_state without owner */

		*pi_state = alloc_pi_state();

		/* Already stores the key: */
		(*pi_state)->key = *key;

		/* init the mutex without owner */
		__rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
	}

	return 0;
}

/*
 * Wake up the first nr_wake waiters on futex1, and requeue the next
 * nr_requeue waiters from the physical page of futex1 onto the
 * physical page of the PI-futex uaddr2.
 */
static int futex_requeue_pi(u32 __user *uaddr1,
			    struct rw_semaphore *fshared,
			    u32 __user *uaddr2,
			    int nr_wake, int nr_requeue, u32 *cmpval)
{
	union futex_key key1, key2;
	struct futex_hash_bucket *hb1, *hb2;
	struct plist_head *head1;
	struct futex_q *this, *next;
	struct futex_pi_state *pi_state2 = NULL;
	struct rt_mutex_waiter *waiter, *top_waiter = NULL;
	struct rt_mutex *lock2 = NULL;
	int ret, drop_count = 0;

	if (refill_pi_state_cache())
		return -ENOMEM;

retry:
	/*
	 * First take all the futex related locks:
	 */
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

	double_lock_hb(hb1, hb2);

	if (likely(cmpval != NULL)) {
		u32 curval;

		ret = get_futex_value_locked(&curval, uaddr1);

		if (unlikely(ret)) {
			spin_unlock(&hb1->lock);
			if (hb1 != hb2)
				spin_unlock(&hb2->lock);

			/*
			 * If we would have faulted, release mmap_sem, fault
			 * it in and start all over again.
			 */
			if (fshared)
				up_read(fshared);

			ret = get_user(curval, uaddr1);

			if (!ret)
				goto retry;

			return ret;
		}
		if (curval != *cmpval) {
			ret = -EAGAIN;
			goto out_unlock;
		}
	}

	head1 = &hb1->chain;
	plist_for_each_entry_safe(this, next, head1, list) {
		if (!match_futex (&this->key, &key1))
			continue;
		if (++ret <= nr_wake) {
			wake_futex(this);
		} else {
			/*
			 * FIRST: get and set the pi_state
			 */
			if (!pi_state2) {
				int s;
				/* do this only the first time we requeue someone */
				s = lookup_pi_state_for_requeue(uaddr2, hb2,
								&key2, &pi_state2);
				if (s) {
					ret = s;
					goto out_unlock;
				}

				lock2 = &pi_state2->pi_mutex;
				spin_lock(&lock2->wait_lock);

				/* Save the top waiter of the wait_list */
				if (rt_mutex_has_waiters(lock2))
					top_waiter = rt_mutex_top_waiter(lock2);
			} else
				atomic_inc(&pi_state2->refcount);


			this->pi_state = pi_state2;

			/*
			 * SECOND: requeue futex_q to the correct hashbucket
			 */

			/*
			 * If key1 and key2 hash to the same bucket, no need to
			 * requeue.
			 */
			if (likely(head1 != &hb2->chain)) {
				plist_del(&this->list, &hb1->chain);
				plist_add(&this->list, &hb2->chain);
				this->lock_ptr = &hb2->lock;
#ifdef CONFIG_DEBUG_PI_LIST
				this->list.plist.lock = &hb2->lock;
#endif
			}
			this->key = key2;
			get_futex_key_refs(&key2);
			drop_count++;


			/*
			 * THIRD: queue it to lock2
			 */
			spin_lock_irq(&this->task->pi_lock);
			waiter = &this->waiter;
			waiter->task = this->task;
			waiter->lock = lock2;
			plist_node_init(&waiter->list_entry, this->task->prio);
			plist_node_init(&waiter->pi_list_entry, this->task->prio);
			plist_add(&waiter->list_entry, &lock2->wait_list);
			this->task->pi_blocked_on = waiter;
			spin_unlock_irq(&this->task->pi_lock);

			if (ret - nr_wake >= nr_requeue)
				break;
		}
	}

	/* If we've requeued some tasks and the top_waiter of the rt_mutex
	   has changed, we must adjust the priority of the owner, if any */
	if (drop_count) {
		struct task_struct *owner = rt_mutex_owner(lock2);
		if (owner &&
		    (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
			int chain_walk = 0;

			spin_lock_irq(&owner->pi_lock);
			if (top_waiter)
				plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
			else
				/*
				 * There were no waiters before the requeue;
				 * the flag must be updated.
				 */
				mark_rt_mutex_waiters(lock2);

			plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
			__rt_mutex_adjust_prio(owner);
			if (owner->pi_blocked_on) {
				chain_walk = 1;
				get_task_struct(owner);
			}

			spin_unlock_irq(&owner->pi_lock);
			spin_unlock(&lock2->wait_lock);

			if (chain_walk)
				rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
							   current);
		} else {
			/* No owner or the top_waiter does not change */
			mark_rt_mutex_waiters(lock2);
			spin_unlock(&lock2->wait_lock);
		}
	}

out_unlock:
	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);

	/* drop_futex_key_refs() must be called outside the spinlocks. */
	while (--drop_count >= 0)
		drop_futex_key_refs(&key1);

out:
	if (fshared)
		up_read(fshared);
	return ret;
}

/*
 * Wake up to nr_wake waiters on uaddr1; then perform the user-space
 * operation 'op' on uaddr2 and, if its comparison succeeds, wake up to
 * nr_wake2 waiters on uaddr2 as well:
 */
static int
futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
	      u32 __user *uaddr2,
	      int nr_wake, int nr_wake2, int op)
{
	union futex_key key1, key2;
	struct futex_hash_bucket *hb1, *hb2;
	struct plist_head *head;
	struct futex_q *this, *next;
	int ret, op_ret, attempt = 0;

retryfull:
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

retry:
	double_lock_hb(hb1, hb2);

	op_ret = futex_atomic_op_inuser(op, uaddr2);
	if (unlikely(op_ret < 0)) {
		u32 dummy;

		spin_unlock(&hb1->lock);
		if (hb1 != hb2)
			spin_unlock(&hb2->lock);

#ifndef CONFIG_MMU
		/*
		 * we don't get EFAULT from MMU faults if we don't have an MMU,
		 * but we might get them from range checking
		 */
		ret = op_ret;
		goto out;
#endif

		if (unlikely(op_ret != -EFAULT)) {
			ret = op_ret;
			goto out;
		}

		/*
		 * futex_atomic_op_inuser needs to both read and write
		 * *(int __user *)uaddr2, but we can't modify it
		 * non-atomically.  Therefore, if get_user below is not
		 * enough, we need to handle the fault ourselves, while
		 * still holding the mmap_sem.
		 */
		if (attempt++) {
			ret = futex_handle_fault((unsigned long)uaddr2,
						fshared, attempt);
			if (ret)
				goto out;
			goto retry;
		}

		/*
		 * If we would have faulted, release mmap_sem,
		 * fault it in and start all over again.
		 */
		if (fshared)
			up_read(fshared);

		ret = get_user(dummy, uaddr2);
		if (ret)
			return ret;

		goto retryfull;
	}

	head = &hb1->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex (&this->key, &key1)) {
			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}

	if (op_ret > 0) {
		head = &hb2->chain;

		op_ret = 0;
		plist_for_each_entry_safe(this, next, head, list) {
			if (match_futex (&this->key, &key2)) {
				wake_futex(this);
				if (++op_ret >= nr_wake2)
					break;
			}
		}
		ret += op_ret;
	}

	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);
out:
	if (fshared)
		up_read(fshared);
	return ret;
}

/*
 * Wake up to nr_wake waiters on uaddr1, and requeue up to nr_requeue
 * of the remaining waiters from uaddr1's physical page to uaddr2's.
 */
static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
			 u32 __user *uaddr2,
			 int nr_wake, int nr_requeue, u32 *cmpval)
{
	union futex_key key1, key2;
	struct futex_hash_bucket *hb1, *hb2;
	struct plist_head *head1;
	struct futex_q *this, *next;
	int ret, drop_count = 0;

 retry:
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

	double_lock_hb(hb1, hb2);

	if (likely(cmpval != NULL)) {
		u32 curval;

		ret = get_futex_value_locked(&curval, uaddr1);

		if (unlikely(ret)) {
			spin_unlock(&hb1->lock);
			if (hb1 != hb2)
				spin_unlock(&hb2->lock);

			/*
			 * If we would have faulted, release mmap_sem, fault
			 * it in and start all over again.
			 */
			if (fshared)
				up_read(fshared);

			ret = get_user(curval, uaddr1);

			if (!ret)
				goto retry;

			return ret;
		}
		if (curval != *cmpval) {
			ret = -EAGAIN;
			goto out_unlock;
		}
	}

	head1 = &hb1->chain;
	plist_for_each_entry_safe(this, next, head1, list) {
		if (!match_futex (&this->key, &key1))
			continue;
		if (++ret <= nr_wake) {
			wake_futex(this);
		} else {
			/*
			 * If key1 and key2 hash to the same bucket, no need to
			 * requeue.
			 */
			if (likely(head1 != &hb2->chain)) {
				plist_del(&this->list, &hb1->chain);
				plist_add(&this->list, &hb2->chain);
				this->lock_ptr = &hb2->lock;
#ifdef CONFIG_DEBUG_PI_LIST
				this->list.plist.lock = &hb2->lock;
#endif
			}
			this->key = key2;
			get_futex_key_refs(&key2);
			drop_count++;

			if (ret - nr_wake >= nr_requeue)
				break;
		}
	}

out_unlock:
	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);

	/* drop_futex_key_refs() must be called outside the spinlocks. */
	while (--drop_count >= 0)
		drop_futex_key_refs(&key1);

out:
	if (fshared)
		up_read(fshared);
	return ret;
}

/* The key must be already stored in q->key. */
static inline struct futex_hash_bucket *
queue_lock(struct futex_q *q, int fd, struct file *filp)
{
	struct futex_hash_bucket *hb;

	q->fd = fd;
	q->filp = filp;

	init_waitqueue_head(&q->waiters);

	get_futex_key_refs(&q->key);
	hb = hash_futex(&q->key);
	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
#ifdef CONFIG_DEBUG_PI_LIST
	q->list.plist.lock = &hb->lock;
#endif
	plist_add(&q->list, &hb->chain);
	q->task = current;
	spin_unlock(&hb->lock);
}

static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
	spin_unlock(&hb->lock);
	drop_futex_key_refs(&q->key);
}

/*
 * queue_me and unqueue_me must be called as a pair, each
 * exactly once.  They are called with the hashed spinlock held.
 */

/* The key must be already stored in q->key. */
static void queue_me(struct futex_q *q, int fd, struct file *filp)
{
	struct futex_hash_bucket *hb;

	hb = queue_lock(q, fd, filp);
	__queue_me(q, hb);
}

/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
 retry:
	lock_ptr = q->lock_ptr;
	barrier();
	if (lock_ptr != 0) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock.  This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock().  It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock().  It cannot,
		 * however, change back to the original value.  Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		WARN_ON(plist_node_empty(&q->list));
		plist_del(&q->list, &q->list.plist);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	drop_futex_key_refs(&q->key);
	return ret;
}

/*
 * PI futexes can not be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
 * and dropped here.
 */
static void unqueue_me_pi(struct futex_q *q)
{
	WARN_ON(plist_node_empty(&q->list));
	plist_del(&q->list, &q->list.plist);

	BUG_ON(!q->pi_state);
	free_pi_state(q->pi_state);
	q->pi_state = NULL;

	spin_unlock(q->lock_ptr);

	drop_futex_key_refs(&q->key);
}

/*
 * Fixup the pi_state owner with current.
 *
 * The curr->mm semaphore must be held; it is released on return from
 * this function.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
				struct futex_q *q,
				struct futex_hash_bucket *hb,
				struct task_struct *curr)
{
	u32 newtid = curr->pid | FUTEX_WAITERS;
	struct futex_pi_state *pi_state = q->pi_state;
	u32 uval, curval, newval;
	int ret;

	/* Owner died? */
	if (pi_state->owner != NULL) {
		spin_lock_irq(&pi_state->owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		spin_unlock_irq(&pi_state->owner->pi_lock);
	} else
		newtid |= FUTEX_OWNER_DIED;

	pi_state->owner = curr;

	spin_lock_irq(&curr->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &curr->pi_state_list);
	spin_unlock_irq(&curr->pi_lock);

	/* Unqueue and drop the lock */
	unqueue_me_pi(q);
	if (fshared)
		up_read(fshared);
	/*
	 * We own it, so we have to replace the pending owner
	 * TID. This must be atomic as we have to preserve the
	 * owner-died bit here.
	 */
	ret = get_user(uval, uaddr);
	while (!ret) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;
		newval |= (uval & FUTEX_WAITER_REQUEUED);
		curval = futex_atomic_cmpxchg_inatomic(uaddr,
						       uval, newval);
		if (curval == -EFAULT)
			ret = -EFAULT;
		if (curval == uval)
			break;
		uval = curval;
	}
	return ret;
}

/*
 * In case we must use restart_block to restart a futex_wait,
 * we encode the 'shared' capability in 'arg3':
 */
#define ARG3_SHARED  1

static long futex_wait_restart(struct restart_block *restart);
static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
		      u32 val, ktime_t *abs_time)
{
	struct task_struct *curr = current;
	DECLARE_WAITQUEUE(wait, curr);
	struct futex_hash_bucket *hb;
	struct futex_q q;
	u32 uval;
	int ret;
	struct hrtimer_sleeper t, *to = NULL;
	int rem = 0;

	q.pi_state = NULL;
 retry:
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr, fshared, &q.key);
	if (unlikely(ret != 0))
		goto out_release_sem;

	hb = queue_lock(&q, -1, NULL);

	/*
	 * Access the page AFTER the futex is queued.
	 * Order is important:
	 *
	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
	 *
	 * The basic logical guarantee of a futex is that it blocks ONLY
	 * if cond(var) is known to be true at the time of blocking, for
	 * any cond.  If we queued after testing *uaddr, that would open
	 * a race condition where we could block indefinitely with
	 * cond(var) false, which would violate the guarantee.
	 *
	 * A consequence is that futex_wait() can return zero and absorb
	 * a wakeup when *uaddr != val on entry to the syscall.  This is
	 * rare, but normal.
	 *
	 * For shared futexes, we hold the mmap semaphore, so the mapping
	 * cannot have changed since we looked it up in get_futex_key.
	 * (A userspace sketch of this waiter/waker protocol follows the
	 * function.)
	 */
	ret = get_futex_value_locked(&uval, uaddr);

	if (unlikely(ret)) {
		queue_unlock(&q, hb);

		/*
		 * The atomic access would have faulted, so release
		 * mmap_sem, fault the page in via get_user() and start
		 * all over again.
		 */
		if (fshared)
			up_read(fshared);

		ret = get_user(uval, uaddr);

		if (!ret)
			goto retry;
		return ret;
	}
	ret = -EWOULDBLOCK;
	if (uval != val)
		goto out_unlock_release_sem;

	/*
	 * This rt_mutex_waiter structure is prepared here and will
	 * be used only if this task is requeued from a normal futex to
	 * a PI-futex with futex_requeue_pi.
	 */
	debug_rt_mutex_init_waiter(&q.waiter);
	q.waiter.task = NULL;

	/* Only actually queue if *uaddr contained val.  */
	__queue_me(&q, hb);

	/*
	 * Now the futex is queued and we have checked the data, we
	 * don't want to hold mmap_sem while we sleep.
	 */
	if (fshared)
		up_read(fshared);

	/*
	 * There might have been scheduling since the queue_me(), as we
	 * cannot hold a spinlock across the get_user() in case it
	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
	 * queueing ourselves into the futex hash.  This code thus has to
	 * rely on the futex_wake() code removing us from the hash when it
	 * wakes us up.
	 */

	/* add_wait_queue is the barrier after __set_current_state. */
	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&q.waiters, &wait);
	/*
	 * !plist_node_empty() is safe here without any lock.
	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
	 */
	if (likely(!plist_node_empty(&q.list))) {
		if (!abs_time)
			schedule();
		else {
			to = &t;
			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
			hrtimer_init_sleeper(&t, current);
			t.timer.expires = *abs_time;

			hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);

			/*
			 * the timer could have already expired, in which
			 * case current would be flagged for rescheduling.
			 * Don't bother calling schedule.
			 */
			if (likely(t.task))
				schedule();

			hrtimer_cancel(&t.timer);

			/* Flag if a timeout occurred */
			rem = (t.task == NULL);
		}
	}
	__set_current_state(TASK_RUNNING);

	/*
	 * NOTE: we don't remove ourselves from the waitqueue because
	 * we are the only user of it.
	 */

	if (q.pi_state) {
		/*
		 * We were woken but have been requeued on a PI-futex.
		 * We have to complete the lock acquisition by taking
		 * the rtmutex.
		 */

		struct rt_mutex *lock = &q.pi_state->pi_mutex;

		spin_lock(&lock->wait_lock);
		if (unlikely(q.waiter.task)) {
			remove_waiter(lock, &q.waiter);
		}
		spin_unlock(&lock->wait_lock);

		if (rem)
			ret = -ETIMEDOUT;
		else
			ret = rt_mutex_timed_lock(lock, to, 1);

		if (fshared)
			down_read(fshared);
		spin_lock(q.lock_ptr);

		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case.
		 */
		if (!ret && q.pi_state->owner != curr) {
			/*
			 * We MUST play with the futex we were requeued on,
			 * NOT the current futex.
			 * We can retrieve it from the key of the pi_state
			 */
			uaddr = q.pi_state->key.uaddr;

			/*
			 * mmap_sem and the hash bucket lock are unlocked
			 * on return from this function.
			 */
			ret = fixup_pi_state_owner(uaddr, fshared,
						   &q, hb, curr);
		} else {
			/*
			 * Catch the rare case, where the lock was released
			 * when we were on the way back before we locked
			 * the hash bucket.
			 */
			if (ret && q.pi_state->owner == curr) {
				if (rt_mutex_trylock(&q.pi_state->pi_mutex))
					ret = 0;
			}
			/* Unqueue and drop the lock */
			unqueue_me_pi(&q);
			if (fshared)
				up_read(fshared);
		}

		debug_rt_mutex_free_waiter(&q.waiter);

		return ret;
	}

	debug_rt_mutex_free_waiter(&q.waiter);

	/* If we were woken (and unqueued), we succeeded, whatever. */
	if (!unqueue_me(&q))
		return 0;
	if (rem)
		return -ETIMEDOUT;

	/*
	 * We expect signal_pending(current), but another thread may
	 * have handled it for us already.
	 */
	if (!abs_time)
		return -ERESTARTSYS;
	else {
		struct restart_block *restart;
		restart = &current_thread_info()->restart_block;
		restart->fn = futex_wait_restart;
		restart->arg0 = (unsigned long)uaddr;
		restart->arg1 = (unsigned long)val;
		restart->arg2 = (unsigned long)abs_time;
		restart->arg3 = 0;
		if (fshared)
			restart->arg3 |= ARG3_SHARED;
		return -ERESTART_RESTARTBLOCK;
	}

 out_unlock_release_sem:
	queue_unlock(&q, hb);

 out_release_sem:
	if (fshared)
		up_read(fshared);
	return ret;
}
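
/*
 * Illustration only, not kernel code: a minimal userspace sketch of the
 * waiter/waker protocol that the ordering comment in futex_wait() above
 * refers to. The wrapper name sys_futex_wrap() and the variable 'flag'
 * are invented for the sketch; only the FUTEX_WAIT/FUTEX_WAKE semantics
 * come from this file.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long sys_futex_wrap(int *uaddr, int op, int val)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
 *	}
 *
 *	volatile int flag;	// the futex word, shared between threads
 *
 *	static void waiter(void)
 *	{
 *		int val;
 *
 *		while ((val = flag) == 0)
 *			// Blocks only while flag still equals val (0 here);
 *			// otherwise it fails with EWOULDBLOCK and we retest.
 *			sys_futex_wrap((int *)&flag, FUTEX_WAIT, val);
 *	}
 *
 *	static void waker(void)
 *	{
 *		flag = 1;			// make cond(var) true first...
 *		sys_futex_wrap((int *)&flag, FUTEX_WAKE, 1); // ...then wake
 *	}
 */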


static long futex_wait_restart(struct restart_block *restart)
{
	u32 __user *uaddr = (u32 __user *)restart->arg0;
	u32 val = (u32)restart->arg1;
	ktime_t *abs_time = (ktime_t *)restart->arg2;
	struct rw_semaphore *fshared = NULL;

	restart->fn = do_no_restart_syscall;
	if (restart->arg3 & ARG3_SHARED)
		fshared = &current->mm->mmap_sem;
	return (long)futex_wait(uaddr, fshared, val, abs_time);
}


static void set_pi_futex_owner(struct futex_hash_bucket *hb,
			       union futex_key *key, struct task_struct *p)
{
	struct plist_head *head;
	struct futex_q *this, *next;
	struct futex_pi_state *pi_state = NULL;
	struct rt_mutex *lock;

	/* Search for a waiter that should already exist */

	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex(&this->key, key)) {
			pi_state = this->pi_state;
			break;
		}
	}

	BUG_ON(!pi_state);

	/* set p as pi_state's owner */
	lock = &pi_state->pi_mutex;

	spin_lock(&lock->wait_lock);
	spin_lock_irq(&p->pi_lock);

	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;

	/* set p as pi_mutex's owner */
	debug_rt_mutex_proxy_lock(lock, p);
	WARN_ON(rt_mutex_owner(lock));
	rt_mutex_set_owner(lock, p, 0);
	rt_mutex_deadlock_account_lock(lock, p);

	plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
		  &p->pi_waiters);
	__rt_mutex_adjust_prio(p);

	spin_unlock_irq(&p->pi_lock);
	spin_unlock(&lock->wait_lock);
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 */
static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
			 int detect, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct task_struct *curr = current;
	struct futex_hash_bucket *hb;
	u32 uval, newval, curval;
	struct futex_q q;
	int ret, lock_held, attempt = 0;

	if (refill_pi_state_cache())
		return -ENOMEM;

	if (time) {
		to = &timeout;
		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		to->timer.expires = *time;
	}

	q.pi_state = NULL;
 retry:
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr, fshared, &q.key);
	if (unlikely(ret != 0))
		goto out_release_sem;

	hb = queue_lock(&q, -1, NULL);

 retry_locked:
	lock_held = 0;

	/*
	 * To avoid races, we attempt to take the lock here again
	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
	 * the locks. It will most likely not succeed.
	 */
	newval = current->pid;

	pagefault_disable();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
	pagefault_enable();

	if (unlikely(curval == -EFAULT))
		goto uaddr_faulted;

	/* We own the lock already */
	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
		if (!detect && 0)
			force_sig(SIGKILL, current);
		/*
		 * Normally, this check is done in user space.
		 * After a requeue, the owner may attempt to lock this
		 * futex even though ownership was already handed over by
		 * the previous waker.
		 * That would normally be a deadlock, but it is not in the
		 * REQUEUE_PI case.
		 */
		if (!(curval & FUTEX_WAITER_REQUEUED))
			ret = -EDEADLK;
		goto out_unlock_release_sem;
	}

	/*
	 * Surprise - we got the lock. Just return
	 * to userspace:
	 */
	if (unlikely(!curval))
		goto out_unlock_release_sem;

	uval = curval;
	/*
	 * In case of a requeue, check if there already is an owner
	 * If not, just take the futex.
	 */
	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
		/* set current as futex owner */
		newval = curval | current->pid;
		lock_held = 1;
	} else
		/* Set the WAITERS flag, so the owner will know it has someone
		   to wake at next unlock */
		newval = curval | FUTEX_WAITERS;

	pagefault_disable();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
	pagefault_enable();

	if (unlikely(curval == -EFAULT))
		goto uaddr_faulted;
	if (unlikely(curval != uval))
		goto retry_locked;

	if (lock_held) {
		set_pi_futex_owner(hb, &q.key, curr);
		goto out_unlock_release_sem;
	}

	/*
	 * We don't have the lock. Look up the PI state (or create it if
	 * we are the first waiter):
	 */
	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);

	if (unlikely(ret)) {
		/*
		 * There were no waiters and the owner task lookup
		 * failed. When the OWNER_DIED bit is set, then we
		 * know that this is a robust futex and we actually
		 * take the lock. This is safe as we are protected by
		 * the hash bucket lock. We also set the waiters bit
		 * unconditionally here, to simplify glibc handling of
		 * multiple tasks racing to acquire the lock and
		 * cleanup the problems which were left by the dead
		 * owner.
		 */
		if (curval & FUTEX_OWNER_DIED) {
			uval = newval;
			newval = current->pid |
				FUTEX_OWNER_DIED | FUTEX_WAITERS;

			pagefault_disable();
			curval = futex_atomic_cmpxchg_inatomic(uaddr,
							       uval, newval);
			pagefault_enable();

			if (unlikely(curval == -EFAULT))
				goto uaddr_faulted;
			if (unlikely(curval != uval))
				goto retry_locked;
			ret = 0;
		}
		goto out_unlock_release_sem;
	}

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__queue_me(&q, hb);

	/*
	 * Now the futex is queued and we have checked the data, we
	 * don't want to hold mmap_sem while we sleep.
	 */
	if (fshared)
		up_read(fshared);

	WARN_ON(!q.pi_state);
	/*
	 * Block on the PI mutex:
	 */
	if (!trylock)
		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
	else {
		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
	}

	if (fshared)
		down_read(fshared);
	spin_lock(q.lock_ptr);

	/*
	 * Got the lock. We might not be the anticipated owner if we
	 * did a lock-steal - fix up the PI-state in that case.
	 */
	if (!ret && q.pi_state->owner != curr)
		/* mmap_sem is unlocked on return from this function */
		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
	else {
		/*
		 * Catch the rare case, where the lock was released
		 * when we were on the way back before we locked
		 * the hash bucket.
		 */
		if (ret && q.pi_state->owner == curr) {
			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
				ret = 0;
		}
		/* Unqueue and drop the lock */
		unqueue_me_pi(&q);
		if (fshared)
			up_read(fshared);
	}

	if (!detect && ret == -EDEADLK && 0)
		force_sig(SIGKILL, current);

	return ret != -EINTR ? ret : -ERESTARTNOINTR;

 out_unlock_release_sem:
	queue_unlock(&q, hb);

 out_release_sem:
	if (fshared)
		up_read(fshared);
	return ret;

 uaddr_faulted:
	/*
	 * We have to r/w *(int __user *)uaddr, but we can't modify it
	 * non-atomically. Therefore, if get_user() below is not
	 * enough, we need to handle the fault ourselves, while
	 * still holding the mmap_sem.
	 */
	if (attempt++) {
		ret = futex_handle_fault((unsigned long)uaddr, fshared,
					 attempt);
		if (ret)
			goto out_unlock_release_sem;
		goto retry_locked;
	}

	queue_unlock(&q, hb);
	if (fshared)
		up_read(fshared);

	ret = get_user(uval, uaddr);
	/* get_user() reports a fault via ret; uval carries no error code. */
	if (!ret)
		goto retry;

	return ret;
}
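
/*
 * Illustration only: a hedged sketch of the userspace fast path that
 * funnels into futex_lock_pi() above on contention. The GCC
 * __sync_val_compare_and_swap() builtin stands in for whatever atomic
 * cmpxchg the C library really uses; pi_lock() is an invented name.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static void pi_lock(int *futex)
 *	{
 *		int tid = syscall(SYS_gettid);
 *
 *		// Uncontended fast path: 0 -> TID, no syscall needed.
 *		if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
 *			return;
 *		// Contended: the kernel queues us on the rt-mutex, boosts
 *		// the owner and blocks until we own the lock.
 *		syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 */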

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	u32 uval;
	struct plist_head *head;
	union futex_key key;
	int ret, attempt = 0;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != current->pid)
		return -EPERM;
	/*
	 * First take all the futex related locks:
	 */
	if (fshared)
		down_read(fshared);

	ret = get_futex_key(uaddr, fshared, &key);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
	spin_lock(&hb->lock);

retry_locked:
	/*
	 * To avoid races, try to do the TID -> 0 atomic transition
	 * again. If it succeeds then we can return without waking
	 * anyone else up:
	 */
	if (!(uval & FUTEX_OWNER_DIED)) {
		pagefault_disable();
		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
		pagefault_enable();
	}

	if (unlikely(uval == -EFAULT))
		goto pi_faulted;
	/*
	 * Rare case: we managed to release the lock atomically,
	 * no need to wake anyone else up:
	 */
	if (unlikely(uval == current->pid))
		goto out_unlock;

	/*
	 * Ok, other tasks may need to be woken up - check waiters
	 * and do the wakeup if necessary:
	 */
	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (!match_futex(&this->key, &key))
			continue;
		ret = wake_futex_pi(uaddr, uval, this);
		/*
		 * The atomic access to the futex value
		 * generated a pagefault, so retry the
		 * user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		goto out_unlock;
	}
	/*
	 * No waiters - kernel unlocks the futex:
	 */
	if (!(uval & FUTEX_OWNER_DIED)) {
		ret = unlock_futex_pi(uaddr, uval);
		if (ret == -EFAULT)
			goto pi_faulted;
	}

out_unlock:
	spin_unlock(&hb->lock);
out:
	if (fshared)
		up_read(fshared);

	return ret;

pi_faulted:
	/*
	 * We have to r/w *(int __user *)uaddr, but we can't modify it
	 * non-atomically. Therefore, if get_user() below is not
	 * enough, we need to handle the fault ourselves, while
	 * still holding the mmap_sem.
	 */
	if (attempt++) {
		ret = futex_handle_fault((unsigned long)uaddr, fshared,
					 attempt);
		if (ret)
			goto out_unlock;
		goto retry_locked;
	}

	spin_unlock(&hb->lock);
	if (fshared)
		up_read(fshared);

	ret = get_user(uval, uaddr);
	/* get_user() reports a fault via ret; uval carries no error code. */
	if (!ret)
		goto retry;

	return ret;
}
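
/*
 * Illustration only, continuing the pi_lock() sketch above: the matching
 * unlock fast path. TID -> 0 can only succeed while no kernel state
 * (FUTEX_WAITERS, FUTEX_OWNER_DIED) is recorded in the word; otherwise
 * futex_unlock_pi() above does the rt-mutex unlock and the wakeup.
 *
 *	static void pi_unlock(int *futex)
 *	{
 *		int tid = syscall(SYS_gettid);
 *
 *		// Fast path: we own it and nobody is queued on it.
 *		if (__sync_val_compare_and_swap(futex, tid, 0) == tid)
 *			return;
 *		syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */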

static int futex_close(struct inode *inode, struct file *filp)
{
	struct futex_q *q = filp->private_data;

	unqueue_me(q);
	kfree(q);

	return 0;
}

/* This is one-shot: once it's gone off you need a new fd */
static unsigned int futex_poll(struct file *filp,
			       struct poll_table_struct *wait)
{
	struct futex_q *q = filp->private_data;
	int ret = 0;

	poll_wait(filp, &q->waiters, wait);

	/*
	 * plist_node_empty() is safe here without any lock.
	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
	 */
	if (plist_node_empty(&q->list))
		ret = POLLIN | POLLRDNORM;

	return ret;
}

static const struct file_operations futex_fops = {
	.release	= futex_close,
	.poll		= futex_poll,
};

/*
 * Signal allows caller to avoid the race which would occur if they
 * set the sigio stuff up afterwards.
 */
static int futex_fd(u32 __user *uaddr, int signal)
{
	struct futex_q *q;
	struct file *filp;
	int ret, err;
	struct rw_semaphore *fshared;
	static unsigned long printk_interval;

	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
		printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
		       "will be removed from the kernel in June 2007\n",
		       current->comm);
	}

	ret = -EINVAL;
	if (!valid_signal(signal))
		goto out;

	ret = get_unused_fd();
	if (ret < 0)
		goto out;
	filp = get_empty_filp();
	if (!filp) {
		put_unused_fd(ret);
		ret = -ENFILE;
		goto out;
	}
	filp->f_op = &futex_fops;
	filp->f_path.mnt = mntget(futex_mnt);
	filp->f_path.dentry = dget(futex_mnt->mnt_root);
	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;

	if (signal) {
		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
		if (err < 0)
			goto error;
		filp->f_owner.signum = signal;
	}

	q = kmalloc(sizeof(*q), GFP_KERNEL);
	if (!q) {
		err = -ENOMEM;
		goto error;
	}
	q->pi_state = NULL;

	fshared = &current->mm->mmap_sem;
	down_read(fshared);
	err = get_futex_key(uaddr, fshared, &q->key);

	if (unlikely(err != 0)) {
		up_read(fshared);
		kfree(q);
		goto error;
	}

	/*
	 * queue_me() must be called before releasing mmap_sem, because
	 * key->shared.inode needs to be referenced while holding it.
	 */
	filp->private_data = q;

	queue_me(q, ret, filp);
	up_read(fshared);

	/* Now we map fd to filp, so userspace can access it */
	fd_install(ret, filp);
out:
	return ret;
error:
	put_unused_fd(ret);
	put_filp(filp);
	ret = err;
	goto out;
}
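
/*
 * Illustration only, for the (deprecated, see the warning above)
 * FUTEX_FD interface: a sketch of polling the returned one-shot fd
 * instead of blocking in FUTEX_WAIT. 'flag' is the invented futex word
 * from the sketch after futex_wait(); passing 0 as the signal skips the
 * SIGIO setup.
 *
 *	#include <poll.h>
 *
 *	int fd = syscall(SYS_futex, &flag, FUTEX_FD, 0, NULL, NULL, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);	// returns once the futex has been woken
 *	close(fd);		// one-shot: a fresh fd is needed afterwards
 */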

/*
 * Support for robust futexes: the kernel cleans up held futexes at
 * thread exit time.
 *
 * Implementation: user-space maintains a per-thread list of locks it
 * is holding. Upon do_exit(), the kernel carefully walks this list,
 * and marks all locks that are owned by this thread with the
 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
 * always manipulated with the lock held, so the list is private and
 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
 * field, to allow the kernel to clean up if the thread dies after
 * acquiring the lock, but just before it could have added itself to
 * the list. There can only be one such pending lock.
 */

/**
 * sys_set_robust_list - set the robust-futex list head of a task
 * @head: pointer to the list-head
 * @len: length of the list-head, as userspace expects
 */
asmlinkage long
sys_set_robust_list(struct robust_list_head __user *head,
		    size_t len)
{
	/*
	 * The kernel knows only one size for now:
	 */
	if (unlikely(len != sizeof(*head)))
		return -EINVAL;

	current->robust_list = head;

	return 0;
}
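
/*
 * Illustration only: the userspace half of the robust-list contract
 * described above, registered once per thread. The structure layout is
 * the one from <linux/futex.h>; register_robust_list() is an invented
 * helper, and futex_offset is 0 only for this sketch (glibc stores the
 * real offset of the lock word inside its mutex).
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static __thread struct robust_list_head head;
 *
 *	static void register_robust_list(void)
 *	{
 *		head.list.next = &head.list;	// empty list points at itself
 *		head.futex_offset = 0;		// lock word offset in an entry
 *		head.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &head, sizeof(head));
 *	}
 */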

/**
 * sys_get_robust_list - get the robust-futex list head of a task
 * @pid: pid of the process [zero for current task]
 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
 * @len_ptr: pointer to a length field, the kernel fills in the header size
 */
asmlinkage long
sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
		    size_t __user *len_ptr)
{
	struct robust_list_head __user *head;
	unsigned long ret;

	if (!pid)
		head = current->robust_list;
	else {
		struct task_struct *p;

		ret = -ESRCH;
		rcu_read_lock();
		p = find_task_by_pid(pid);
		if (!p)
			goto err_unlock;
		ret = -EPERM;
		if ((current->euid != p->euid) && (current->euid != p->uid) &&
				!capable(CAP_SYS_PTRACE))
			goto err_unlock;
		head = p->robust_list;
		rcu_read_unlock();
	}

	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(head, head_ptr);

err_unlock:
	rcu_read_unlock();

	return ret;
}

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
	u32 uval, nval, mval;

retry:
	if (get_user(uval, uaddr))
		return -1;

	if ((uval & FUTEX_TID_MASK) == curr->pid) {
		/*
		 * Ok, this dying thread is truly holding a futex
		 * of interest. Set the OWNER_DIED bit atomically
		 * via cmpxchg, and if the value had FUTEX_WAITERS
		 * set, wake up a waiter (if any). (We have to do a
		 * futex_wake() even if OWNER_DIED is already set -
		 * to handle the rare but possible case of recursive
		 * thread-death.) The rest of the cleanup is done in
		 * userspace.
		 */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		/* Also keep the FUTEX_WAITER_REQUEUED flag if set */
		mval |= (uval & FUTEX_WAITER_REQUEUED);
		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);

		if (nval == -EFAULT)
			return -1;

		if (nval != uval)
			goto retry;

		/*
		 * Wake robust non-PI futexes here. The wakeup of
		 * PI futexes happens in exit_pi_state():
		 */
		if (!pi) {
			if (uval & FUTEX_WAITERS)
				futex_wake(uaddr, &curr->mm->mmap_sem, 1);
		}
	}
	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of a list-walking problem.
 */
void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned long futex_offset;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	if (pending)
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip);

	while (entry != &head->list) {
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending)
			if (handle_futex_death((void __user *)entry + futex_offset,
						curr, pi))
				return;
		/*
		 * Fetch the next entry in the list:
		 */
		if (fetch_robust_entry(&entry, &entry->next, &pi))
			return;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
}

long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
		u32 __user *uaddr2, u32 val2, u32 val3)
{
	int ret;
	int cmd = op & FUTEX_CMD_MASK;
	struct rw_semaphore *fshared = NULL;

	if (!(op & FUTEX_PRIVATE_FLAG))
		fshared = &current->mm->mmap_sem;

	switch (cmd) {
	case FUTEX_WAIT:
		ret = futex_wait(uaddr, fshared, val, timeout);
		break;
	case FUTEX_WAKE:
		ret = futex_wake(uaddr, fshared, val);
		break;
	case FUTEX_FD:
		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
		ret = futex_fd(uaddr, val);
		break;
	case FUTEX_REQUEUE:
		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
		break;
	case FUTEX_CMP_REQUEUE:
		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
		break;
	case FUTEX_WAKE_OP:
		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
		break;
	case FUTEX_LOCK_PI:
		ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
		break;
	case FUTEX_UNLOCK_PI:
		ret = futex_unlock_pi(uaddr, fshared);
		break;
	case FUTEX_TRYLOCK_PI:
		ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
		break;
	case FUTEX_CMP_REQUEUE_PI:
		ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
		break;
	default:
		ret = -ENOSYS;
	}
	return ret;
}


asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
			  struct timespec __user *utime, u32 __user *uaddr2,
			  u32 val3)
{
	struct timespec ts;
	ktime_t t, *tp = NULL;
	u32 val2 = 0;
	int cmd = op & FUTEX_CMD_MASK;

	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
			return -EFAULT;
		if (!timespec_valid(&ts))
			return -EINVAL;

		t = timespec_to_ktime(ts);
		if (cmd == FUTEX_WAIT)
			t = ktime_add(ktime_get(), t);
		tp = &t;
	}
	/*
	 * The requeue commands (FUTEX_REQUEUE, FUTEX_CMP_REQUEUE and
	 * FUTEX_CMP_REQUEUE_PI) pass their second value in 'utime':
	 */
	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
	    || cmd == FUTEX_CMP_REQUEUE_PI)
		val2 = (u32) (unsigned long) utime;

	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}

static int futexfs_get_sb(struct file_system_type *fs_type,
			  int flags, const char *dev_name, void *data,
			  struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
}

static struct file_system_type futex_fs_type = {
	.name		= "futexfs",
	.get_sb		= futexfs_get_sb,
	.kill_sb	= kill_anon_super,
};

static int __init init(void)
{
	int i = register_filesystem(&futex_fs_type);

	if (i)
		return i;

	futex_mnt = kern_mount(&futex_fs_type);
	if (IS_ERR(futex_mnt)) {
		unregister_filesystem(&futex_fs_type);
		return PTR_ERR(futex_mnt);
	}

	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
		spin_lock_init(&futex_queues[i].lock);
	}
	return 0;
}
__initcall(init);