Review notes (text before the first "diff --git" line is not part of any hunk):

  * ob_bstate == 0 is treated as "not yet tagged"; the tests were previously
    spelled "!x->ob_bstate == NULL" / "x->ob_bstate == NULL" and are now
    written as plain integer comparisons with the same truth value.
  * Early returns taken when PyErr_WarnPy3k() fails now release the
    sequence reference that the surrounding code already holds.
  * NOTE(review): BSTATE_BYTE and BSTATE_UNICODE are used below but are not
    defined in any hunk of this patch -- confirm they are defined elsewhere.
  * NOTE(review): ob_bstate is added to struct _object after PyObject_HEAD,
    not inside the macro. Object structs that start with the PyObject_HEAD
    macro instead of embedding PyObject would presumably not contain the
    field, so reading it through a PyObject * could alias another member --
    confirm the layout before merging.
  * NOTE(review): both new tests exercise "+", while the C changes are in
    the join functions -- confirm the tests actually reach the new code.
  * NOTE(review): test_py3k_join uses the name "deprecations"; its
    definition is not visible in this patch -- confirm it is in scope.

diff --git a/Include/object.h b/Include/object.h
index 807b24188a75b2..3fd52bc966e54f 100644
--- a/Include/object.h
+++ b/Include/object.h
@@ -105,6 +105,7 @@ whose size is determined when the object is allocated.
  */
 typedef struct _object {
     PyObject_HEAD
+    Py_ssize_t ob_bstate;
 } PyObject;
 
 typedef struct {
diff --git a/Lib/test/test_StringIO.py b/Lib/test/test_StringIO.py
index e855462dda8849..9bc7e0f6a7f104 100644
--- a/Lib/test/test_StringIO.py
+++ b/Lib/test/test_StringIO.py
@@ -183,6 +183,10 @@ def test_py3k_string_cmp(self):
         with test_support.check_py3k_warnings(*deprecations):
             "test str" == u"test unicode"
 
+    def test_py3k_join(self):
+        with test_support.check_py3k_warnings(*deprecations):
+            "test str" + u"test unicode"
+
 class TestcStringIO(TestGenericStringIO):
     MODULE = cStringIO
 
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 92476f68a53c11..4ce10e0722b31f 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1020,6 +1020,10 @@ def test_codecs_idna(self):
         # Test whether trailing dot is preserved
         self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
 
+    def test_3k_join(self):
+        with test_support.check_py3k_warnings():
+            u"test unicode" + "test str"
+
     def test_codecs_errors(self):
         # Error handling (encoding)
         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index d4b536565f760c..db4512740b05ab 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1646,6 +1646,19 @@ string_join(PyStringObject *self, PyObject *orig)
                  * original sequence can be iterated over
                  * again, so we must pass seq here.
                  */
                 PyObject *result;
+                /* Emit the py3k warning for an already-tagged non-str
+                 * item before handing the join to PyUnicode_Join. */
+                if (!PyString_Check(item) && item->ob_bstate != 0) {
+                    if (PyErr_WarnPy3k("Concatenation only works for bytes to bytes in 3.x: convert the string to bytes.", 1) < 0) {
+                        /* Release seq on this early exit, as the
+                         * normal return below does. */
+                        Py_DECREF(seq);
+                        return NULL;
+                    }
+                }
+                if (!PyString_Check(orig) && item->ob_bstate == 0) {
+                    item->ob_bstate = BSTATE_BYTE;
+                }
                 result = PyUnicode_Join((PyObject *)self, seq);
                 Py_DECREF(seq);
@@ -3719,8 +3732,9 @@ str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
 static PyObject *
 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
     PyObject *x = NULL;
     static char *kwlist[] = {"object", 0};
+    PyObject *tmp = NULL;
 
     if (type != &PyString_Type)
         return str_subtype_new(type, args, kwds);
@@ -3728,7 +3742,11 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return NULL;
     if (x == NULL)
         return PyString_FromString("");
-    return PyObject_Str(x);
+    tmp = PyObject_Str(x);
+    if (tmp == NULL)
+        return NULL;
+    tmp->ob_bstate = BSTATE_BYTE;
+    return tmp;
 }
 
 static PyObject *
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c00f7f365555d3..66f711c78b9f34 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5673,6 +5673,16 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
     PyObject *item;
     Py_ssize_t i;
 
+    if (!PyUnicode_Check(seq) && seq->ob_bstate != 0) {
+        if (PyErr_WarnPy3k("Concatenation only works for unicode to unicode in 3.x: convert the string to unicode.", 1) < 0) {
+            return NULL;
+        }
+    }
+
+    if (!PyUnicode_Check(seq) && seq->ob_bstate == 0) {
+        seq->ob_bstate = BSTATE_UNICODE;
+    }
+
     fseq = PySequence_Fast(seq, "can only join an iterable");
     if (fseq == NULL) {
         return NULL;
@@ -5694,7 +5704,18 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
     /* If singleton sequence with an exact Unicode, return that. */
     if (seqlen == 1) {
         item = PySequence_Fast_GET_ITEM(fseq, 0);
+        if (!PyUnicode_CheckExact(item) && item->ob_bstate != 0) {
+            if (PyErr_WarnPy3k("Concatenation only works for unicode to unicode in 3.x: convert the string to unicode.", 1) < 0) {
+                /* fseq was obtained above; drop it before bailing out. */
+                Py_DECREF(fseq);
+                return NULL;
+            }
+        }
+
+        if (!PyUnicode_CheckExact(item) && item->ob_bstate == 0) {
+            item->ob_bstate = BSTATE_UNICODE;
+        }
         if (PyUnicode_CheckExact(item)) {
             Py_INCREF(item);
             res = (PyUnicodeObject *)item;
             goto Done;
@@ -8851,6 +8872,7 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     static char *kwlist[] = {"string", "encoding", "errors", 0};
     char *encoding = NULL;
     char *errors = NULL;
+    PyObject *tmp = NULL;
 
     if (type != &PyUnicode_Type)
         return unicode_subtype_new(type, args, kwds);
@@ -8859,10 +8881,14 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return NULL;
     if (x == NULL)
         return (PyObject *)_PyUnicode_New(0);
     if (encoding == NULL && errors == NULL)
-        return PyObject_Unicode(x);
+        tmp = PyObject_Unicode(x);
     else
-        return PyUnicode_FromEncodedObject(x, encoding, errors);
+        tmp = PyUnicode_FromEncodedObject(x, encoding, errors);
+    if (tmp == NULL)
+        return NULL;
+    tmp->ob_bstate = BSTATE_UNICODE;
+    return tmp;
 }
 
 static PyObject *