-
Notifications
You must be signed in to change notification settings - Fork 7.9k
Add SORT_STRICT option to array_unique() #7806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,6 +103,7 @@ PHP_MINIT_FUNCTION(array) /* {{{ */ | |
|
||
REGISTER_LONG_CONSTANT("SORT_REGULAR", PHP_SORT_REGULAR, CONST_CS | CONST_PERSISTENT); | ||
REGISTER_LONG_CONSTANT("SORT_NUMERIC", PHP_SORT_NUMERIC, CONST_CS | CONST_PERSISTENT); | ||
REGISTER_LONG_CONSTANT("SORT_STRICT", PHP_SORT_STRICT, CONST_CS | CONST_PERSISTENT); | ||
REGISTER_LONG_CONSTANT("SORT_STRING", PHP_SORT_STRING, CONST_CS | CONST_PERSISTENT); | ||
REGISTER_LONG_CONSTANT("SORT_LOCALE_STRING", PHP_SORT_LOCALE_STRING, CONST_CS | CONST_PERSISTENT); | ||
REGISTER_LONG_CONSTANT("SORT_NATURAL", PHP_SORT_NATURAL, CONST_CS | CONST_PERSISTENT); | ||
|
@@ -349,6 +350,104 @@ static zend_always_inline int php_array_data_compare_unstable_i(Bucket *f, Bucke | |
} | ||
/* }}} */ | ||
|
||
/* return int to be compatible with compare_func_t */ | ||
static int hash_zval_strict_function(zval *z1, zval *z2) /* {{{ */ | ||
{ | ||
ZVAL_DEREF(z1); | ||
ZVAL_DEREF(z2); | ||
// If the types are different, compare based on type. | ||
// (Values of different types can't be identical.) | ||
int t1 = Z_TYPE_P(z1); | ||
int t2 = Z_TYPE_P(z2); | ||
if ( t1 != t2 ) { | ||
return (t1 > t2 ) ? 1 : -1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is meant to be used for a sort exposed to end users, than having E.g. for https://github.com/TysonAndre/pecl-teds/blob/0.5.1/teds.c#L577-L618 handles some of the edge cases. Those edge cases are https://github.com/TysonAndre/pecl-teds/blob/0.5.1/tests/misc/stable_compare.phpt#L75-L86 |
||
} | ||
// The most important thing about this comparison mode is that the result | ||
// is 0 when zend_is_identical, and non-zero otherwise. This test is | ||
// done first to make it easier to verify this property by inspection. | ||
// (Arrays are excluded as an optimization, to avoid a redudant | ||
// deep inspection.) | ||
if (t1 != IS_ARRAY && zend_is_identical(z1, z2)) { | ||
return 0; | ||
} | ||
// Both types are the same *but the values are not identical* | ||
// Below this point, the return value for non-array values | ||
// should always be non-zero. | ||
switch (t1) { | ||
case IS_LONG: | ||
return Z_LVAL_P(z1) > Z_LVAL_P(z2) ? 1 : -1; | ||
|
||
case IS_DOUBLE: | ||
return Z_DVAL_P(z1) > Z_DVAL_P(z2) ? 1 : -1; | ||
|
||
case IS_STRING: | ||
return zend_binary_strcmp( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: zend_binary_zval_strcmp(z1, z2) does the same thing and is a bit shorter to read |
||
Z_STRVAL_P(z1), Z_STRLEN_P(z1), | ||
Z_STRVAL_P(z2), Z_STRLEN_P(z2) | ||
); | ||
|
||
case IS_ARRAY: | ||
// Do a recursive comparison. This is consistent with the test | ||
// for arrays in zend_is_identical, but unlike that method it | ||
// provides a meaningful ordering in the case of non-identity | ||
// as well. | ||
return zend_hash_compare( | ||
Z_ARRVAL_P(z1), Z_ARRVAL_P(z2), | ||
(compare_func_t) hash_zval_strict_function, 1 /* ordered */ | ||
); | ||
|
||
case IS_OBJECT: | ||
{ | ||
// Start with a recursive comparison like for arrays, for consistency. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why doesn't this only compare the object ID? That should be sufficient to establish an ordering. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because the object ID leads to nondeterministic behavior. In my experience, having an array be sorted slightly differently every time you run the program is the sort of thing that leads to headaches. Starting with a recursive comparison of attributes and properties is consistent with the way arrays are sorted, and the way that objects and arrays are compared with == (ie, that's the behavior of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (in latest patch added test cases to document predictable order of non-identical objects, dropped the class name comparison.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (and to be clear by "nondeterminism" I mean that adding a new object allocation somewhere in your program, or reordering some code which performs allocations, can cause a nonlocal change in the sort order elsewhere. I believe in PHP the object IDs are actually, strictly-speaking, deterministic given identical inputs to a program, but in practice in large programs it "feels" nondeterministic because of the effect of nonlocal changes.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Depends on what the goal here is. I don't think SORT_STRICT is particularly useful outside array_unique(). And array_unique() just needs some arbitrary order, so it should be the most efficient such order. It may be worthwhile to accept SORT_STRICT in array_unique() only, to avoid exposing any particular strict ordering to userland -- it becomes an unobservable implementation detail. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I started out thinking about just
I'm not convinced that the sort order for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's unobservable.
https://www.php.net/array_unique
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another reason I'd personally avoid comparing properties is because recursive comparisons would be a fatal error, and recursive object references are much, much more common than recursive array references. For example, trees (e.g. Abstract Syntax Trees) might have parents reference childs, and childs reference parent (Or an object and the implementation of an object that handles details of serializing the object to a cache such as memcache) Comparing two different MyObject instances would result in a fatal error from infinite recursion detection. php > class MyObject { public $loader; public $other; }
php > class MyLoader { public $object; }
php > $object = new MyObject();
php > $object->loader = new MyLoader();
php > $object->loader->object = $object;
php > var_dump($object);
object(MyObject)#1 (2) {
["loader"]=>
object(MyLoader)#2 (1) {
["object"]=>
*RECURSION*
}
["other"]=>
NULL
} |
||
// (This is deliberately not using the user-defined `compare` handler, | ||
// nor is it using zend_std_compare_objects() because that uses | ||
// zend_compare when examining properties, not a strict comparison.) | ||
zend_object *zobj1 = Z_OBJ_P(z1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are planning on proposing a sort function: Should this compare class names as well? E.g. with strcmp before building and comparing object properties. If not, document why not?
EDIT: document why not, I didn't see the reason why in discussion |
||
zend_object *zobj2 = Z_OBJ_P(z2); | ||
rebuild_object_properties(zobj1); | ||
rebuild_object_properties(zobj2); | ||
// zend_std_compare_objects() uses unordered comparison, but that | ||
// leads to a unpredictable sort: with unordered the properties are | ||
// compared in the order they appear in the *first* object so | ||
// `compare(a,b)` is not guaranteed to be the same as `-compare(b,a)`. | ||
int c = zend_hash_compare( | ||
zobj1->properties, zobj2->properties, | ||
(compare_func_t) hash_zval_strict_function, 1 /* ordered */ | ||
); | ||
if (c != 0) { | ||
return (c > 0) ? 1 : -1; | ||
} | ||
// Fall back on spl_object_id() value, which will probably vary | ||
// non-deterministically between runs (alas). | ||
ZEND_ASSERT(zobj1->handle != zobj2->handle); | ||
return (zobj1->handle > zobj2->handle) ? 1 : -1; | ||
} | ||
|
||
case IS_RESOURCE: | ||
// This will also likely vary non-deterministically between runs. | ||
return Z_RES_HANDLE_P(z1) > Z_RES_HANDLE_P(z2) ? 1 : -1; | ||
|
||
case IS_REFERENCE: | ||
ZEND_ASSERT(0 && "Should have been dereferenced above"); | ||
|
||
case IS_UNDEF: | ||
case IS_NULL: | ||
case IS_FALSE: | ||
case IS_TRUE: | ||
default: | ||
ZEND_ASSERT(0 && "Values w/ same type should be identical"); | ||
return 0; | ||
cscott marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
/* }}} */ | ||
|
||
|
||
static zend_always_inline int php_array_data_compare_strict_unstable_i(Bucket *f, Bucket *s) /* {{{ */ | ||
{ | ||
return hash_zval_strict_function(&f->val, &s->val); | ||
} | ||
/* }}} */ | ||
|
||
static zend_always_inline int php_array_data_compare_numeric_unstable_i(Bucket *f, Bucket *s) /* {{{ */ | ||
{ | ||
return numeric_compare_function(&f->val, &s->val); | ||
|
@@ -405,6 +504,7 @@ DEFINE_SORT_VARIANTS(key_compare_string_case); | |
DEFINE_SORT_VARIANTS(key_compare_string); | ||
DEFINE_SORT_VARIANTS(key_compare_string_locale); | ||
DEFINE_SORT_VARIANTS(data_compare); | ||
DEFINE_SORT_VARIANTS(data_compare_strict); | ||
DEFINE_SORT_VARIANTS(data_compare_numeric); | ||
DEFINE_SORT_VARIANTS(data_compare_string_case); | ||
DEFINE_SORT_VARIANTS(data_compare_string); | ||
|
@@ -527,6 +627,14 @@ static bucket_compare_func_t php_get_data_compare_func(zend_long sort_type, int | |
} | ||
break; | ||
|
||
case PHP_SORT_STRICT: | ||
if (reverse) { | ||
return php_array_reverse_data_compare_strict; | ||
} else { | ||
return php_array_data_compare_strict; | ||
} | ||
break; | ||
|
||
case PHP_SORT_REGULAR: | ||
default: | ||
if (reverse) { | ||
|
@@ -591,6 +699,14 @@ static bucket_compare_func_t php_get_data_compare_func_unstable(zend_long sort_t | |
} | ||
break; | ||
|
||
case PHP_SORT_STRICT: | ||
if (reverse) { | ||
return php_array_reverse_data_compare_strict_unstable; | ||
} else { | ||
return php_array_data_compare_strict_unstable; | ||
} | ||
break; | ||
|
||
case PHP_SORT_REGULAR: | ||
default: | ||
if (reverse) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
--TEST-- | ||
Test array_unique() function : SORT_STRICT functionality | ||
--FILE-- | ||
<?php | ||
echo "*** Testing array_unique() : SORT_STRICT functionality ***\n"; | ||
|
||
var_dump( array_unique( [ "1234", 1234 ] ) ); | ||
var_dump( array_unique( [ "1234", "1234" ], SORT_STRICT ) ); | ||
var_dump( array_unique( [ "1234", 1234 ], SORT_STRICT ) ); | ||
|
||
var_dump( array_unique( [ 0, "0", 0.0, "0.0", '', null, null ] ) ); | ||
var_dump( array_unique( [ 0, "0", 0.0, "0.0", '', null, null ], SORT_STRICT ) ); | ||
|
||
// These are more values which are == but not === | ||
$a = (object)[]; | ||
$b = (object)[]; | ||
$a2 = [ $a ]; | ||
$b2 = [ $b ]; | ||
$a3 = (object)[ 'foo' => $a ]; | ||
$b3 = (object)[ 'foo' => $b ]; | ||
var_dump( $a == $b && $a2 == $b2 && $a3 == $b3 ); | ||
var_dump( $a === $b || $a2 === $b2 || $a3 === $b3 ); | ||
|
||
var_dump( count( array_unique( [ $a, $b, $a2, $b2, $a3, $b3 ], SORT_STRICT ) ) ); | ||
|
||
?> | ||
--EXPECT-- | ||
*** Testing array_unique() : SORT_STRICT functionality *** | ||
array(1) { | ||
[0]=> | ||
string(4) "1234" | ||
} | ||
array(1) { | ||
[0]=> | ||
string(4) "1234" | ||
} | ||
array(2) { | ||
[0]=> | ||
string(4) "1234" | ||
[1]=> | ||
int(1234) | ||
} | ||
array(3) { | ||
[0]=> | ||
int(0) | ||
[3]=> | ||
string(3) "0.0" | ||
[4]=> | ||
string(0) "" | ||
} | ||
array(6) { | ||
[0]=> | ||
int(0) | ||
[1]=> | ||
string(1) "0" | ||
[2]=> | ||
float(0) | ||
[3]=> | ||
string(3) "0.0" | ||
[4]=> | ||
string(0) "" | ||
[5]=> | ||
NULL | ||
} | ||
bool(true) | ||
bool(false) | ||
int(6) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.