Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/object.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ whose size is determined when the object is allocated.
*/
typedef struct _object {
PyObject_HEAD
Py_ssize_t ob_bstate;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we want to put this only in strings/bytes, rather than in every object? Or is that not possible?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some methods operate on the abstract object type (see #39), so it is necessary.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm unsure what it means to ask for the bstate of something that isn't a string. Maybe I'm missing something!

} PyObject;

typedef struct {
Expand Down
4 changes: 4 additions & 0 deletions Lib/test/test_StringIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ def test_py3k_string_cmp(self):
with test_support.check_py3k_warnings(*deprecations):
"test str" == u"test unicode"

def test_py3k_join(self):
with test_support.check_py3k_warnings(*deprecations):
"test str" + u"test unicode"

class TestcStringIO(TestGenericStringIO):
MODULE = cStringIO

Expand Down
5 changes: 5 additions & 0 deletions Lib/test/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,11 @@ def test_codecs_idna(self):
# Test whether trailing dot is preserved
self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")

def test_3k_join(self):
with test_support.check_py3k_warnings():
u"test unicode" + "test str"


def test_codecs_errors(self):
# Error handling (encoding)
self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
Expand Down
16 changes: 15 additions & 1 deletion Objects/stringobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1646,6 +1646,15 @@ string_join(PyStringObject *self, PyObject *orig)
* original sequence can be iterated over
* again, so we must pass seq here.
*/
if (!PyString_Check(item) && !item->ob_bstate == NULL) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I now realise is we probably need a Py_GetBState(...) function that returns (say) -1 for "not string/unicode" and then 0/1/2/3/whatever for the actual bstate. That way we can avoid putting bstate on every object.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Today I started to add Py_GetBState; in doing so I broke other tests, which I am still fixing.

if (PyErr_WarnPy3k("Concatenation only works for bytes to bytes in 3.x: convert the string to bytes.", 1) < 0) {
return NULL;
}
}

if (!PyString_Check(orig) && item->ob_bstate == NULL) {
item->ob_bstate = BSTATE_BYTE;
}
PyObject *result;
result = PyUnicode_Join((PyObject *)self, seq);
Py_DECREF(seq);
Expand Down Expand Up @@ -1700,6 +1709,7 @@ _PyString_Join(PyObject *sep, PyObject *x)
{
assert(sep != NULL && PyString_Check(sep));
assert(x != NULL);

return string_join((PyStringObject *)sep, x);
}

Expand Down Expand Up @@ -3719,16 +3729,20 @@ str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
/* tp_new for str: implements str(object='').
 *
 * Subtypes are delegated to str_subtype_new().  For the exact str type the
 * single optional argument is converted with PyObject_Str() and the result
 * is tagged as a byte string by setting ob_bstate to BSTATE_BYTE.
 *
 * Returns a new reference, or NULL with an exception set when argument
 * parsing or the conversion fails.
 */
static PyObject *
string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *x = NULL;
    static char *kwlist[] = {"object", 0};
    PyObject *result = NULL;

    if (type != &PyString_Type)
        return str_subtype_new(type, args, kwds);
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str", kwlist, &x))
        return NULL;
    /* str() with no argument yields the empty string (left untagged, as
       in the original code path). */
    if (x == NULL)
        return PyString_FromString("");

    result = PyObject_Str(x);
    /* The conversion can fail (e.g. __str__ raised); propagate the error
       instead of dereferencing NULL below. */
    if (result == NULL)
        return NULL;

    /* NOTE(review): PyObject_Str() may hand back its argument unchanged
       when x is already an exact str, in which case this tags the
       caller's object -- confirm that is intended. */
    result->ob_bstate = BSTATE_BYTE;
    return result;
}

static PyObject *
Expand Down
36 changes: 31 additions & 5 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -5673,6 +5673,16 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
PyObject *item;
Py_ssize_t i;

if (!PyUnicode_Check(seq) && !seq->ob_bstate == NULL) {
if (PyErr_WarnPy3k("Concatenation only works for unicode to unicode in 3.x: convert the string to unicode.", 1) < 0) {
return NULL;
}
}

if (!PyUnicode_Check(seq) && seq->ob_bstate == NULL) {
seq->ob_bstate = BSTATE_UNICODE;
}

fseq = PySequence_Fast(seq, "can only join an iterable");
if (fseq == NULL) {
return NULL;
Expand All @@ -5694,7 +5704,16 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
/* If singleton sequence with an exact Unicode, return that. */
if (seqlen == 1) {
item = PySequence_Fast_GET_ITEM(fseq, 0);
if (PyUnicode_CheckExact(item)) {
if (!PyUnicode_CheckExact(item) && !item->ob_bstate == NULL) {
if (PyErr_WarnPy3k("Concatenation only works for unicode to unicode in 3.x: convert the string to unicode.", 1) < 0) {
return NULL;
}
}

if (!PyUnicode_CheckExact(item) && item->ob_bstate == NULL) {
item->ob_bstate = BSTATE_UNICODE;
}
if (PyUnicode_CheckExact(item)) {
Py_INCREF(item);
res = (PyUnicodeObject *)item;
goto Done;
Expand Down Expand Up @@ -8851,6 +8870,7 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"string", "encoding", "errors", 0};
char *encoding = NULL;
char *errors = NULL;
PyUnicodeObject *tmp = NULL;

if (type != &PyUnicode_Type)
return unicode_subtype_new(type, args, kwds);
Expand All @@ -8859,10 +8879,16 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return NULL;
if (x == NULL)
return (PyObject *)_PyUnicode_New(0);
if (encoding == NULL && errors == NULL)
return PyObject_Unicode(x);
else
return PyUnicode_FromEncodedObject(x, encoding, errors);
if (encoding == NULL && errors == NULL) {
tmp = PyObject_Unicode(x);
tmp->ob_bstate = BSTATE_UNICODE;
return tmp;
}
else {
tmp = PyUnicode_FromEncodedObject(x, encoding, errors);
tmp->ob_bstate = BSTATE_UNICODE;
return tmp;
}
}

static PyObject *
Expand Down