Commit 2548c730 authored by Martin v. Löwis

Implement IDNA (Internationalized Domain Names in Applications).

Parent 8d17a90b
......@@ -112,6 +112,7 @@ and how to embed it in other applications.
\input{libtextwrap}
\input{libcodecs}
\input{libunicodedata}
\input{libstringprep}
\input{libmisc} % Miscellaneous Services
\input{libpydoc}
......
......@@ -5,7 +5,7 @@
\modulesynopsis{Encode and decode data and streams.}
\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
\index{Unicode}
\index{Codecs}
......@@ -809,6 +809,11 @@ listed as operand type in the table.
{byte string}
{Convert operand to hexadecimal representation, with two digits per byte}
\lineiv{idna}
{}
{Unicode string}
{Implements \rfc{3490}. \versionadded{2.3}. See also \module{encodings.idna}}
\lineiv{mbcs}
{dbcs}
{Unicode string}
......@@ -819,6 +824,11 @@ listed as operand type in the table.
{Unicode string}
{Encoding of PalmOS 3.5}
\lineiv{punycode}
{}
{Unicode string}
{Implements \rfc{3492}. \versionadded{2.3}}
\lineiv{quopri_codec}
{quopri, quoted-printable, quotedprintable}
{byte string}
......@@ -865,3 +875,63 @@ listed as operand type in the table.
{Compress the operand using gzip}
\end{tableiv}
\subsection{\module{encodings.idna} ---
Internationalized Domain Names in Applications}
\declaremodule{standard}{encodings.idna}
\modulesynopsis{Internationalized Domain Names implementation}
\moduleauthor{Martin v. L\"owis}
This module implements \rfc{3490} (Internationalized Domain Names in
Applications) and \rfc{3491} (Nameprep: A Stringprep Profile for
Internationalized Domain Names (IDN)). It builds upon the
\code{punycode} encoding and \module{stringprep}. \versionadded{2.3}
These RFCs together define a protocol to support non-ASCII characters
in domain names. A domain name containing non-ASCII characters (such
as ``www.Alliancefran\c{c}aise.nu'') is converted into an
ASCII-compatible encoding (ACE, such as
``www.xn--alliancefranaise-npb.nu''). The ACE form of the domain name
is then used in all places where arbitrary characters are not allowed
by the protocol, such as DNS queries, HTTP \code{Host:} fields, and so
on. This conversion is carried out in the application and, where
possible, is invisible to the user: the application should
transparently convert Unicode domain labels to IDNA on the wire, and
convert ACE labels back to Unicode before presenting them to the user.
Python supports this conversion in several ways: the \code{idna} codec
performs conversion between Unicode and the ACE. Furthermore, the
\module{socket} module transparently converts Unicode host names to
ACE, so that applications need not be concerned about converting host
names themselves when they pass them to the socket module. On top of
that, modules that have host names as function parameters, such as
\module{httplib} and \module{ftplib}, accept Unicode host names
(\module{httplib} then also transparently sends an IDNA hostname in
the \code{Host:} field if it sends that field at all).
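
For example, the conversion described above can be performed directly
with the codec (a brief interactive sketch; the ACE form is the one
shown above, and the decoded result comes back lower-cased because
nameprep case-folds labels during encoding):

\begin{verbatim}
>>> u"www.Alliancefran\u00E7aise.nu".encode("idna")
'www.xn--alliancefranaise-npb.nu'
>>> 'www.xn--alliancefranaise-npb.nu'.decode("idna")
u'www.alliancefran\xe7aise.nu'
\end{verbatim}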
When receiving host names from the wire (such as in reverse name
lookup), no automatic conversion to Unicode is performed: Applications
wishing to present such host names to the user should decode them to
Unicode.
The module \module{encodings.idna} also implements the nameprep
procedure, which performs certain normalizations on host names in
order to achieve case-insensitivity of international domain names and
to unify similar characters. The nameprep functions can be used
directly if desired.
\begin{funcdesc}{nameprep}{label}
Return the nameprepped version of \var{label}. The implementation
currently assumes query strings, so \code{AllowUnassigned} is
true.
\end{funcdesc}
\begin{funcdesc}{ToASCII}{label}
Convert a label to ASCII, as specified in \rfc{3490}.
\code{UseSTD3ASCIIRules} is assumed to be false.
\end{funcdesc}
\begin{funcdesc}{ToUnicode}{label}
Convert a label to Unicode, as specified in \rfc{3490}.
\end{funcdesc}
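
The following sketch illustrates these functions on the label from the
example above (the output values are derived from that example):

\begin{verbatim}
>>> from encodings import idna
>>> idna.nameprep(u"Alliancefran\u00E7aise")
u'alliancefran\xe7aise'
>>> idna.ToASCII(u"Alliancefran\u00E7aise")
'xn--alliancefranaise-npb'
>>> idna.ToUnicode("xn--alliancefranaise-npb")
u'alliancefran\xe7aise'
\end{verbatim}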
\section{\module{stringprep} ---
Internet String Preparation}
\declaremodule{standard}{stringprep}
\modulesynopsis{String preparation, as per RFC 3454}
\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de}
\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
When identifying things (such as host names) on the Internet, it is
often necessary to compare such identifications for
``equality''. Exactly how this comparison is carried out may depend on
the application domain, e.g. whether it should be case-insensitive or
not. It may also be necessary to restrict the possible
identifications, to allow only identifications consisting of
``printable'' characters.
\rfc{3454} defines a procedure for ``preparing'' Unicode strings in
internet protocols. Before passing strings onto the wire, they are
processed with the preparation procedure, after which they have a
certain normalized form. The RFC defines a set of tables, which can be
combined into profiles. Each profile must define which tables it uses,
and what other optional parts of the \code{stringprep} procedure are
part of the profile. One example of a \code{stringprep} profile is
\code{nameprep}, which is used for internationalized domain names.
The module \module{stringprep} only exposes the tables from RFC
3454. As these tables would be very large to represent as
dictionaries or lists, the module uses the Unicode character database
internally. The module source code itself was generated using the
\code{mkstringprep.py} utility.
As a result, these tables are exposed as functions, not as data
structures. There are two kinds of tables in the RFC: sets and
mappings. For a set, \module{stringprep} provides the ``characteristic
function'', i.e. a function that returns true if the parameter is part
of the set. For mappings, it provides the mapping function: given the
key, it returns the associated value. Below is a list of all functions
available in the module.
\begin{funcdesc}{in_table_a1}{code}
Determine whether \var{code} is in table~A.1 (Unassigned code points
in Unicode 3.2).
\end{funcdesc}

\begin{funcdesc}{in_table_b1}{code}
Determine whether \var{code} is in table~B.1 (Commonly mapped to
nothing).
\end{funcdesc}

\begin{funcdesc}{map_table_b2}{code}
Return the mapped value for \var{code} according to table~B.2
(Mapping for case-folding used with NFKC).
\end{funcdesc}

\begin{funcdesc}{map_table_b3}{code}
Return the mapped value for \var{code} according to table~B.3
(Mapping for case-folding used with no normalization).
\end{funcdesc}

\begin{funcdesc}{in_table_c11}{code}
Determine whether \var{code} is in table~C.1.1
(ASCII space characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c12}{code}
Determine whether \var{code} is in table~C.1.2
(Non-ASCII space characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c11_c12}{code}
Determine whether \var{code} is in table~C.1
(Space characters, union of C.1.1 and C.1.2).
\end{funcdesc}

\begin{funcdesc}{in_table_c21}{code}
Determine whether \var{code} is in table~C.2.1
(ASCII control characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c22}{code}
Determine whether \var{code} is in table~C.2.2
(Non-ASCII control characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c21_c22}{code}
Determine whether \var{code} is in table~C.2
(Control characters, union of C.2.1 and C.2.2).
\end{funcdesc}

\begin{funcdesc}{in_table_c3}{code}
Determine whether \var{code} is in table~C.3
(Private use).
\end{funcdesc}

\begin{funcdesc}{in_table_c4}{code}
Determine whether \var{code} is in table~C.4
(Non-character code points).
\end{funcdesc}

\begin{funcdesc}{in_table_c5}{code}
Determine whether \var{code} is in table~C.5
(Surrogate codes).
\end{funcdesc}

\begin{funcdesc}{in_table_c6}{code}
Determine whether \var{code} is in table~C.6
(Inappropriate for plain text).
\end{funcdesc}

\begin{funcdesc}{in_table_c7}{code}
Determine whether \var{code} is in table~C.7
(Inappropriate for canonical representation).
\end{funcdesc}

\begin{funcdesc}{in_table_c8}{code}
Determine whether \var{code} is in table~C.8
(Change display properties or are deprecated).
\end{funcdesc}

\begin{funcdesc}{in_table_c9}{code}
Determine whether \var{code} is in table~C.9
(Tagging characters).
\end{funcdesc}

\begin{funcdesc}{in_table_d1}{code}
Determine whether \var{code} is in table~D.1
(Characters with bidirectional property ``R'' or ``AL'').
\end{funcdesc}

\begin{funcdesc}{in_table_d2}{code}
Determine whether \var{code} is in table~D.2
(Characters with bidirectional property ``L'').
\end{funcdesc}
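
A brief interactive sketch (the code points are chosen for
illustration only):

\begin{verbatim}
>>> import stringprep
>>> stringprep.in_table_c11(u" ")      # ASCII space
True
>>> stringprep.in_table_b1(u"\u00AD")  # soft hyphen is mapped to nothing
True
>>> stringprep.map_table_b2(u"A")      # case-folding used with NFKC
u'a'
\end{verbatim}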
......@@ -1791,6 +1791,27 @@ Tkinter.wantobjects = 0
Any breakage caused by this change should be reported as a bug.
\item Support for internationalized domain names (RFCs 3454, 3490,
3491, and 3492) has been added. The ``idna'' encoding can be used
to convert between a Unicode domain name and the ASCII-compatible
encoding (ACE).
\begin{verbatim}
>>> u"www.Alliancefran\,caise.nu".encode("idna")
'www.xn--alliancefranaise-npb.nu'
\end{verbatim}
In addition, the \module{socket} module has been extended to
transparently convert Unicode hostnames to the ACE before passing them
to the C library. In turn, modules that pass hostnames ``through''
(such as \module{httplib} and \module{ftplib}) also support Unicode
host names (\module{httplib} also sends ACE \code{Host:} headers).
\module{urllib} supports Unicode URLs with non-ASCII host names as
long as the \code{path} part of the URL is ASCII only.
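
For instance, a Unicode host name can now be passed directly to the
resolver functions; the following sketch omits the output, since it
depends on the local resolver:

\begin{verbatim}
>>> import socket
>>> socket.getaddrinfo(u"www.Alliancefran\u00E7aise.nu", 80)
\end{verbatim}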
To implement this change, the module \module{stringprep}, the tool
\code{mkstringprep} and the \code{punycode} encoding have been added.
\end{itemize}
......
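New file: the encodings.idna module (the idna codec).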
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import stringprep, unicodedata, re, codecs

# IDNA section 3.1
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError, "Invalid character %s" % repr(c)

    # Check bidi
    RandAL = map(stringprep.in_table_d1, label)
    for c in RandAL:
        if c:
            # There is a RandAL char in the string. Must perform further
            # tests:
            # 1) The characters in section 5.8 MUST be prohibited.
            # This is table C.8, which was already checked
            # 2) If a string contains any RandALCat character, the string
            # MUST NOT contain any LCat character.
            if filter(stringprep.in_table_d2, label):
                raise UnicodeError, "Violation of BIDI requirement 2"
            # 3) If a string contains any RandALCat character, a
            # RandALCat character MUST be the first character of the
            # string, and a RandALCat character MUST be the last
            # character of the string.
            if not RandAL[0] or not RandAL[-1]:
                raise UnicodeError, "Violation of BIDI requirement 3"
    return label
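
# Example, for illustration (label taken from the documentation above):
#   nameprep(u"Alliancefran\u00E7aise") == u"alliancefran\xe7aise"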
def ToASCII(label):
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError, "label too long"

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError, "label too long"

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError, "Label starts with ACE prefix"

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError, "label too long"
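
# Example, for illustration (same label as above):
#   ToASCII(u"Alliancefran\u00E7aise") == "xn--alliancefranaise-npb"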
def ToUnicode(label):
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError, "Invalid character in IDN label"

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError, ("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
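
# Example, for illustration (ACE form taken from the documentation above):
#   ToUnicode("xn--alliancefranaise-npb") == u"alliancefran\xe7aise"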
### Codec APIs

class Codec(codecs.Codec):
    def encode(self, input, errors='strict'):
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError, "unsupported error handling "+errors

        result = []
        for label in dots.split(input):
            result.append(ToASCII(label))
        # Join with U+002E
        return ".".join(result), len(input)

    def decode(self, input, errors='strict'):
        if errors != 'strict':
            raise UnicodeError, "Unsupported error handling "+errors

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; this call merely validates the input
            unicode(input, "ascii")
            labels = input.split(".")

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return u".".join(result), len(input)

class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
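
New file: the encodings.punycode module (the punycode codec).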
# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation"""
    base = []
    extended = {}
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            extended[c] = 1
    extended = extended.keys()
    extended.sort()
    return "".join(base).encode("ascii"), extended
def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    res = 0
    for c in str:
        if ord(c) < max:
            res += 1
    return res

def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string."""

    l = len(str)
    while 1:
        pos += 1
        if pos == l:
            return (-1, -1)
        c = str[pos]
        if c == char:
            return index+1, pos
        elif c < char:
            index += 1
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding"""
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        delta = (curlen+1) * (char - oldchar)
        while 1:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char
    return result
def T(j, bias):
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1: return 1
    if res > 26: return 26
    return res

digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers"""
    result = []
    j = 0
    while 1:
        t = T(j, bias)
        if N < t:
            result.append(digits[N])
            return result
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35 # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation"""
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = []
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points==0, baselen+points+1)
    return "".join(result)

def punycode_encode(text):
    base, extended = segregate(text)
    base = base.encode("ascii")
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
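
# Example, for illustration (derived from the IDNA documentation example):
#   punycode_encode(u"alliancefran\u00e7aise") == "alliancefranaise-npb"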
##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers"""
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError, "incomplete punycode string"
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1

def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding"""
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError, ("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    pos = text.rfind("-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = text[:pos]
        extended = text[pos+1:]
    base = unicode(base, "ascii", errors)
    extended = extended.upper()
    return insertion_sort(base, extended, errors)
### Codec APIs

class Codec(codecs.Codec):
    def encode(self, input, errors='strict'):
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError, "Unsupported error handling "+errors
        res = punycode_decode(input, errors)
        return res, len(input)

class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
......@@ -655,11 +655,11 @@ class HTTPConnection:
nil, netloc, nil, nil, nil = urlsplit(url)
if netloc:
self.putheader('Host', netloc)
self.putheader('Host', netloc.encode("idna"))
elif self.port == HTTP_PORT:
self.putheader('Host', self.host)
self.putheader('Host', self.host.encode("idna"))
else:
self.putheader('Host', "%s:%s" % (self.host, self.port))
self.putheader('Host', "%s:%s" % (self.host.encode("idna"), self.port))
# note: we are assuming that clients will not attempt to set these
# headers since *this* library must deal with the
......
......@@ -118,6 +118,11 @@ Extension modules
Library
-------
- Support for internationalized domain names has been added through
the 'idna' and 'punycode' encodings, the 'stringprep' module, the
'mkstringprep' tool, and enhancements to the socket and httplib
modules.
- htmlentitydefs has two new dictionaries: name2codepoint maps
HTML entity names to Unicode codepoints (as integers).
codepoint2name is the reverse mapping. See SF patch #722017.
......
......@@ -874,7 +874,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args,
args->ob_type->tp_name);
return 0;
}
if (!PyArg_ParseTuple(args, "si:getsockaddrarg", &host, &port))
if (!PyArg_ParseTuple(args, "eti:getsockaddrarg",
"idna", &host, &port))
return 0;
if (setipaddr(host, (struct sockaddr *)addr, sizeof(*addr), AF_INET) < 0)
return 0;
......@@ -893,7 +894,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args,
int port, flowinfo, scope_id;
addr = (struct sockaddr_in6*)&(s->sock_addr).in6;
flowinfo = scope_id = 0;
if (!PyArg_ParseTuple(args, "si|ii", &host, &port, &flowinfo,
if (!PyArg_ParseTuple(args, "eti|ii",
"idna", &host, &port, &flowinfo,
&scope_id)) {
return 0;
}
......@@ -2782,6 +2784,7 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
{
struct addrinfo hints, *res;
struct addrinfo *res0 = NULL;
PyObject *hobj = NULL;
PyObject *pobj = (PyObject *)NULL;
char pbuf[30];
char *hptr, *pptr;
......@@ -2789,12 +2792,27 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
int error;
PyObject *all = (PyObject *)NULL;
PyObject *single = (PyObject *)NULL;
PyObject *idna = NULL;
family = socktype = protocol = flags = 0;
family = AF_UNSPEC;
if (!PyArg_ParseTuple(args, "zO|iiii:getaddrinfo",
&hptr, &pobj, &family, &socktype,
&protocol, &flags)) {
if (!PyArg_ParseTuple(args, "OO|iiii:getaddrinfo",
&hobj, &pobj, &family, &socktype,
&protocol, &flags)) {
return NULL;
}
if (hobj == Py_None) {
hptr = NULL;
} else if (PyUnicode_Check(hobj)) {
idna = PyObject_CallMethod(hobj, "encode", "s", "idna");
if (!idna)
return NULL;
hptr = PyString_AsString(idna);
} else if (PyString_Check(hobj)) {
hptr = PyString_AsString(hobj);
} else {
PyErr_SetString(PyExc_TypeError,
"getaddrinfo() argument 1 must be string or None");
return NULL;
}
if (PyInt_Check(pobj)) {
......@@ -2838,12 +2856,14 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
goto err;
Py_XDECREF(single);
}
Py_XDECREF(idna);
if (res0)
freeaddrinfo(res0);
return all;
err:
Py_XDECREF(single);
Py_XDECREF(all);
Py_XDECREF(idna);
if (res0)
freeaddrinfo(res0);
return (PyObject *)NULL;
......