From 0a6d78d6f5da148c366d642cf6b0c01bd942ef8a Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Mon, 9 Mar 2020 11:06:59 +0530 Subject: [PATCH 01/14] Overload == and isequal to check schema equality --- src/schema.jl | 27 +++++++++++++++++++++++++++ src/terms.jl | 40 ++++++++++++++++++++++++++++++++++++++++ test/schema.jl | 13 ++++++++++--- 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 7ef8e88f..3c0d2852 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -53,6 +53,33 @@ Base.merge!(a::Schema, b::Schema) = (merge!(a.schema, b.schema); a) Base.keys(schema::Schema) = keys(schema.schema) Base.haskey(schema::Schema, key) = haskey(schema.schema, key) +function ==(first::Schema, second::Schema) + first === second && return true + first.schema === second.schema && return true + if length(first.schema) != length(second.schema) + return false + end + for key in keys(first) + !haskey(second, key) && + get(second, key, nothing) != get(first, key, nothing) && return false + end + true +end + +function Base.isequal(first::Schema, second::Schema) + first === second && return true + first.schema === second.schema && return true + if length(first.schema) != length(second.schema) + return false + end + for key in keys(first) + !haskey(second, key) && + isequal(get(second, key, nothing) != get(first, key, nothing)) && + return false + end + true +end + """ schema([terms::AbstractVector{<:AbstractTerm}, ]data, hints::Dict{Symbol}) schema(term::AbstractTerm, data, hints::Dict{Symbol}) diff --git a/src/terms.jl b/src/terms.jl index 2ee5f785..4200ded0 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -1,3 +1,4 @@ +import Base.== , Base.isequal abstract type AbstractTerm end const TermOrTerms = Union{AbstractTerm, NTuple{N, AbstractTerm} where N} const TupleTerm = NTuple{N, TermOrTerms} where N @@ -38,6 +39,8 @@ struct ConstantTerm{T<:Number} <: AbstractTerm end width(::ConstantTerm) = 1 +==(first::ConstantTerm, second::ConstantTerm) = first.n == second.n +isequal(first::ConstantTerm, second::ConstantTerm) = isequal(first.n, second.n) """ FormulaTerm{L,R} <: AbstractTerm @@ -54,6 +57,11 @@ struct FormulaTerm{L,R} <: AbstractTerm rhs::R end +==(first::FormulaTerm, second::FormulaTerm) = first.lhs == second.lhs && + first.rhs == second.rhs +isequal(first::FormulaTerm, second::FormulaTerm) = isequal(first.lhs, second.lhs) && + isequal(first.rhs, second.rhs) + """ FunctionTerm{Forig,Fanon,Names} <: AbstractTerm @@ -127,6 +135,12 @@ FunctionTerm(forig::Fo, fanon::Fa, names::NTuple{N,Symbol}, FunctionTerm{Fo, Fa, names}(forig, fanon, exorig, args_parsed) width(::FunctionTerm) = 1 +==(first::FunctionTerm, second::FunctionTerm) = first.forig == second.forig && + first.args_parsed == second.args_parsed +isequal(first::FunctionTerm, second::FunctionTerm) = + isequal(first.forig, second.forig) && + isequal(first.args_parsed, second.args_parsed) + """ InteractionTerm{Ts} <: AbstractTerm @@ -174,6 +188,10 @@ struct InteractionTerm{Ts} <: AbstractTerm end width(ts::InteractionTerm) = prod(width(t) for t in ts.terms) +==(first::InteractionTerm, second::InteractionTerm) = + first.terms == second.terms +isequal(first::InteractionTerm, second::InteractionTerm) = + isequal(first.terms, second.terms) """ InterceptTerm{HasIntercept} <: AbstractTerm @@ -187,6 +205,10 @@ via the [`implicit_intercept`](@ref) trait). struct InterceptTerm{HasIntercept} <: AbstractTerm end width(::InterceptTerm{H}) where {H} = H ? 1 : 0 +==(first::InterceptTerm, second::InterceptTerm) = width(first) == width(second) +isequal(first::InterceptTerm, second::InterceptTerm) = + isequal(width(first), width(second)) + # Typed terms """ @@ -211,6 +233,14 @@ struct ContinuousTerm{T} <: AbstractTerm end width(::ContinuousTerm) = 1 +==(first::ContinuousTerm, second::ContinuousTerm) = first.sym == second.sym && + first.mean == second.mean && first.var == second.var && + first.min == second.min && first.max == second.max + +isequal(first::ContinuousTerm, second::ContinuousTerm) = + isequal(first.sym, second.sym) && isequal(first.mean, second.mean) && + isequal(first.var, second.var) && isequal(first.min, second.min) && + isequal(first.max, second.max) """ CategoricalTerm{C,T,N} <: AbstractTerm @@ -233,6 +263,12 @@ width(::CategoricalTerm{C,T,N}) where {C,T,N} = N CategoricalTerm(sym::Symbol, contrasts::ContrastsMatrix{C,T}) where {C,T} = CategoricalTerm{C,T,length(contrasts.termnames)}(sym, contrasts) +==(first::CategoricalTerm, second::CategoricalTerm) = + first.sym == second.sym && width(first) == width(second) && + first.contrasts == second.contrasts +isequal(first::CategoricalTerm, second::CategoricalTerm) = + isequal(first.sym, second.sym) && isequal(width(first), width(second)) && + isequal(first.contrasts, second.contrasts) """ MatrixTerm{Ts} <: AbstractTerm @@ -250,6 +286,10 @@ end MatrixTerm(t::AbstractTerm) = MatrixTerm((t, )) width(t::MatrixTerm) = sum(width(tt) for tt in t.terms) +# ==(first::MatrixTerm, second::MatrixTerm) = +# collect_matrix_terms(first.terms) == collect_matrix_terms(second.terms) +# isequal(first::MatrixTerm, second::MatrixTerm) = +# isequal(collect_matrix_terms(first.terms), collect_matrix_terms(second.terms)) """ collect_matrix_terms(ts::TupleTerm) collect_matrix_terms(t::AbstractTerm) = collect_matrix_term((t, )) diff --git a/test/schema.jl b/test/schema.jl index 7786f44b..5b25d032 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -1,10 +1,17 @@ @testset "schemas" begin - + import Base using StatsModels: schema, apply_schema, FullRank - f = @formula(y ~ 1 + a + b + c + b&c) - df = (y = rand(9), a = 1:9, b = rand(9), c = repeat(["d","e","f"], 3)) + f = @formula(y ~ 1 + a + b + c + b & c) + y = rand(9) + b = rand(9) + + df = (y = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) f = apply_schema(f, schema(f, df)) @test f == apply_schema(f, schema(f, df)) + df2 = (y = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) + + @test schema(df) == schema(df2) + @test isequal(schema(df), schema(df2)) end From 1ca55c0a79b3b4db9f7558e67a08f815089f2a93 Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Wed, 11 Mar 2020 23:32:55 +0530 Subject: [PATCH 02/14] Remove unneccessary changes. --- src/schema.jl | 14 +++++--------- src/terms.jl | 7 +++++-- test/schema.jl | 24 +++++++++++++++++++++++- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 3c0d2852..29da2f2a 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -56,11 +56,9 @@ Base.haskey(schema::Schema, key) = haskey(schema.schema, key) function ==(first::Schema, second::Schema) first === second && return true first.schema === second.schema && return true - if length(first.schema) != length(second.schema) - return false - end + length(first.schema) != length(second.schema) && return false for key in keys(first) - !haskey(second, key) && + !haskey(second, key) && return false get(second, key, nothing) != get(first, key, nothing) && return false end true @@ -69,12 +67,10 @@ end function Base.isequal(first::Schema, second::Schema) first === second && return true first.schema === second.schema && return true - if length(first.schema) != length(second.schema) - return false - end + length(first.schema) != length(second.schema) && return false for key in keys(first) - !haskey(second, key) && - isequal(get(second, key, nothing) != get(first, key, nothing)) && + !haskey(second, key) && return false + !isequal(get(second, key, nothing), get(first, key, nothing)) && return false end true diff --git a/src/terms.jl b/src/terms.jl index 4200ded0..4f202801 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -57,8 +57,8 @@ struct FormulaTerm{L,R} <: AbstractTerm rhs::R end -==(first::FormulaTerm, second::FormulaTerm) = first.lhs == second.lhs && - first.rhs == second.rhs +==(first::FormulaTerm, second::FormulaTerm) = + first.lhs == second.lhs && first.rhs == second.rhs isequal(first::FormulaTerm, second::FormulaTerm) = isequal(first.lhs, second.lhs) && isequal(first.rhs, second.rhs) @@ -290,6 +290,9 @@ width(t::MatrixTerm) = sum(width(tt) for tt in t.terms) # collect_matrix_terms(first.terms) == collect_matrix_terms(second.terms) # isequal(first::MatrixTerm, second::MatrixTerm) = # isequal(collect_matrix_terms(first.terms), collect_matrix_terms(second.terms)) +==(first::MatrixTerm, second::MatrixTerm) = first.terms == second.terms +isequal(first::MatrixTerm, second::MatrixTerm) = isequal(first.terms, second.terms) + """ collect_matrix_terms(ts::TupleTerm) collect_matrix_terms(t::AbstractTerm) = collect_matrix_term((t, )) diff --git a/test/schema.jl b/test/schema.jl index 5b25d032..0c564d79 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -10,8 +10,30 @@ f = apply_schema(f, schema(f, df)) @test f == apply_schema(f, schema(f, df)) - df2 = (y = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) + df2 = (y = y, a = 1:9, b = b, c = [df.c; df.c]) + df3 = (y = y, a = 1:9, b = b, c = repeat(["a", "b", "c"], 3)) + df4 = (y = [df.y; df.y], a = [1:9; 1:9], b = [b; b], c = [df.c; df.c]) + df5 = (z = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) + + sch = schema(df, Dict(:c => DummyCoding(base="e"))) + sch2 = schema(df, Dict(:c => EffectsCoding(base="e"))) @test schema(df) == schema(df2) + @test schema(df) != schema(df3) + @test schema(df) != schema(df4) + @test schema(df) != schema(df5) + @test sch != sch2 + @test isequal(schema(df), schema(df2)) + @test !isequal(schema(df), schema(df3)) + @test !isequal(schema(df), schema(df4)) + @test !isequal(schema(df), schema(df5)) + @test !isequal(sch, sch2) + + # @test schema(df) == schema(df3) + # @test isequal(schema(df), schema(df3)) + #@test schema(df) != schema(df4) + #@test isequal(schema(df), schema(df4)) + + end From e6f8b87a8e8155bc4260de5ca144b2413e546622 Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Mon, 16 Mar 2020 13:37:54 +0530 Subject: [PATCH 03/14] Consistent formatting style and add more tests. --- src/schema.jl | 5 ++--- src/terms.jl | 40 ++++++++++++++++++++++++---------------- test/schema.jl | 20 ++++++++++++-------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 29da2f2a..22dcab70 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -59,7 +59,7 @@ function ==(first::Schema, second::Schema) length(first.schema) != length(second.schema) && return false for key in keys(first) !haskey(second, key) && return false - get(second, key, nothing) != get(first, key, nothing) && return false + second[key] != first[key] && return false end true end @@ -70,8 +70,7 @@ function Base.isequal(first::Schema, second::Schema) length(first.schema) != length(second.schema) && return false for key in keys(first) !haskey(second, key) && return false - !isequal(get(second, key, nothing), get(first, key, nothing)) && - return false + !isequal(second[key], first[key]) && return false end true end diff --git a/src/terms.jl b/src/terms.jl index 4f202801..b3c37e7d 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -58,8 +58,10 @@ struct FormulaTerm{L,R} <: AbstractTerm end ==(first::FormulaTerm, second::FormulaTerm) = - first.lhs == second.lhs && first.rhs == second.rhs -isequal(first::FormulaTerm, second::FormulaTerm) = isequal(first.lhs, second.lhs) && + first.lhs == second.lhs && + first.rhs == second.rhs +isequal(first::FormulaTerm, second::FormulaTerm) = + isequal(first.lhs, second.lhs) && isequal(first.rhs, second.rhs) """ @@ -135,7 +137,8 @@ FunctionTerm(forig::Fo, fanon::Fa, names::NTuple{N,Symbol}, FunctionTerm{Fo, Fa, names}(forig, fanon, exorig, args_parsed) width(::FunctionTerm) = 1 -==(first::FunctionTerm, second::FunctionTerm) = first.forig == second.forig && +==(first::FunctionTerm, second::FunctionTerm) = + first.forig == second.forig && first.args_parsed == second.args_parsed isequal(first::FunctionTerm, second::FunctionTerm) = isequal(first.forig, second.forig) && @@ -205,7 +208,8 @@ via the [`implicit_intercept`](@ref) trait). struct InterceptTerm{HasIntercept} <: AbstractTerm end width(::InterceptTerm{H}) where {H} = H ? 1 : 0 -==(first::InterceptTerm, second::InterceptTerm) = width(first) == width(second) +==(first::InterceptTerm, second::InterceptTerm) = + width(first) == width(second) isequal(first::InterceptTerm, second::InterceptTerm) = isequal(width(first), width(second)) @@ -234,12 +238,16 @@ end width(::ContinuousTerm) = 1 ==(first::ContinuousTerm, second::ContinuousTerm) = first.sym == second.sym && - first.mean == second.mean && first.var == second.var && - first.min == second.min && first.max == second.max + first.mean == second.mean && + first.var == second.var && + first.min == second.min && + first.max == second.max isequal(first::ContinuousTerm, second::ContinuousTerm) = - isequal(first.sym, second.sym) && isequal(first.mean, second.mean) && - isequal(first.var, second.var) && isequal(first.min, second.min) && + isequal(first.sym, second.sym) && + isequal(first.mean, second.mean) && + isequal(first.var, second.var) && + isequal(first.min, second.min) && isequal(first.max, second.max) """ CategoricalTerm{C,T,N} <: AbstractTerm @@ -264,10 +272,12 @@ CategoricalTerm(sym::Symbol, contrasts::ContrastsMatrix{C,T}) where {C,T} = CategoricalTerm{C,T,length(contrasts.termnames)}(sym, contrasts) ==(first::CategoricalTerm, second::CategoricalTerm) = - first.sym == second.sym && width(first) == width(second) && + first.sym == second.sym && + width(first) == width(second) && first.contrasts == second.contrasts isequal(first::CategoricalTerm, second::CategoricalTerm) = - isequal(first.sym, second.sym) && isequal(width(first), width(second)) && + isequal(first.sym, second.sym) && + isequal(width(first), width(second)) && isequal(first.contrasts, second.contrasts) """ MatrixTerm{Ts} <: AbstractTerm @@ -286,12 +296,10 @@ end MatrixTerm(t::AbstractTerm) = MatrixTerm((t, )) width(t::MatrixTerm) = sum(width(tt) for tt in t.terms) -# ==(first::MatrixTerm, second::MatrixTerm) = -# collect_matrix_terms(first.terms) == collect_matrix_terms(second.terms) -# isequal(first::MatrixTerm, second::MatrixTerm) = -# isequal(collect_matrix_terms(first.terms), collect_matrix_terms(second.terms)) -==(first::MatrixTerm, second::MatrixTerm) = first.terms == second.terms -isequal(first::MatrixTerm, second::MatrixTerm) = isequal(first.terms, second.terms) +==(first::MatrixTerm, second::MatrixTerm) = + first.terms == second.terms +isequal(first::MatrixTerm, second::MatrixTerm) = + isequal(first.terms, second.terms) """ collect_matrix_terms(ts::TupleTerm) diff --git a/test/schema.jl b/test/schema.jl index 0c564d79..89d93d03 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -1,8 +1,7 @@ @testset "schemas" begin - import Base using StatsModels: schema, apply_schema, FullRank - f = @formula(y ~ 1 + a + b + c + b & c) + f = @formula(y ~ 1 + a + log(b) + c + b & c) y = rand(9) b = rand(9) @@ -14,6 +13,9 @@ df3 = (y = y, a = 1:9, b = b, c = repeat(["a", "b", "c"], 3)) df4 = (y = [df.y; df.y], a = [1:9; 1:9], b = [b; b], c = [df.c; df.c]) df5 = (z = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) + df6 = (y = y, a = 2:10, b = b, c = repeat(["a", "b", "c"], 3)) + df7 = (w = y, d = 1:9, x = b, z = repeat(["d", "e", "f"], 3)) + df8 = (y = y, a = 1:9, c = repeat(["d", "e", "f"], 3)) sch = schema(df, Dict(:c => DummyCoding(base="e"))) sch2 = schema(df, Dict(:c => EffectsCoding(base="e"))) @@ -22,18 +24,20 @@ @test schema(df) != schema(df3) @test schema(df) != schema(df4) @test schema(df) != schema(df5) + @test schema(df) != schema(df6) + @test schema(df) != schema(df7) + @test schema(df) != schema(df8) + @test schema(df8) != schema(df) @test sch != sch2 @test isequal(schema(df), schema(df2)) @test !isequal(schema(df), schema(df3)) @test !isequal(schema(df), schema(df4)) @test !isequal(schema(df), schema(df5)) + @test !isequal(schema(df), schema(df6)) + @test !isequal(schema(df), schema(df7)) + @test !isequal(schema(df), schema(df8)) + @test !isequal(schema(df8), schema(df)) @test !isequal(sch, sch2) - # @test schema(df) == schema(df3) - # @test isequal(schema(df), schema(df3)) - #@test schema(df) != schema(df4) - #@test isequal(schema(df), schema(df4)) - - end From 2f03a7c43eaa10a81d6e98d65a9b90c9d02dbdff Mon Sep 17 00:00:00 2001 From: mrrobot-2000 <60689620+mrrobot-2000@users.noreply.github.com> Date: Mon, 16 Mar 2020 13:46:00 +0530 Subject: [PATCH 04/14] Update src/terms.jl Co-Authored-By: Dave Kleinschmidt --- src/terms.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index b3c37e7d..2fe788a6 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -237,7 +237,8 @@ struct ContinuousTerm{T} <: AbstractTerm end width(::ContinuousTerm) = 1 -==(first::ContinuousTerm, second::ContinuousTerm) = first.sym == second.sym && +==(first::ContinuousTerm, second::ContinuousTerm) = + first.sym == second.sym && first.mean == second.mean && first.var == second.var && first.min == second.min && From ef367d78397ee42f48b67a61ca1acc990ea3a319 Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Mon, 16 Mar 2020 23:14:25 +0530 Subject: [PATCH 05/14] Add more tests. --- test/schema.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/schema.jl b/test/schema.jl index 89d93d03..9b6252a5 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -21,6 +21,7 @@ sch2 = schema(df, Dict(:c => EffectsCoding(base="e"))) @test schema(df) == schema(df2) + @test apply_schema(schema(df)) == apply_schema(schema(df2)) @test schema(df) != schema(df3) @test schema(df) != schema(df4) @test schema(df) != schema(df5) @@ -28,9 +29,11 @@ @test schema(df) != schema(df7) @test schema(df) != schema(df8) @test schema(df8) != schema(df) + @test apply_schema(schema(df)) != apply_schema(schema(df5)) @test sch != sch2 @test isequal(schema(df), schema(df2)) + @test isequal(apply_schema(schema(df)), apply_schema(schema(df2))) @test !isequal(schema(df), schema(df3)) @test !isequal(schema(df), schema(df4)) @test !isequal(schema(df), schema(df5)) @@ -38,6 +41,7 @@ @test !isequal(schema(df), schema(df7)) @test !isequal(schema(df), schema(df8)) @test !isequal(schema(df8), schema(df)) + @test !isequal(apply_schema(schema(df)), apply_schema(schema(df5))) @test !isequal(sch, sch2) end From 993829b8e0b0b7cc8891c778e34c9629360dcaa6 Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Mon, 16 Mar 2020 23:50:14 +0530 Subject: [PATCH 06/14] Fix broken tests. --- test/schema.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/schema.jl b/test/schema.jl index 9b6252a5..5f3b1056 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -21,7 +21,7 @@ sch2 = schema(df, Dict(:c => EffectsCoding(base="e"))) @test schema(df) == schema(df2) - @test apply_schema(schema(df)) == apply_schema(schema(df2)) + @test apply_schema(f, schema(df)) == apply_schema(f, schema(df2)) @test schema(df) != schema(df3) @test schema(df) != schema(df4) @test schema(df) != schema(df5) @@ -29,11 +29,11 @@ @test schema(df) != schema(df7) @test schema(df) != schema(df8) @test schema(df8) != schema(df) - @test apply_schema(schema(df)) != apply_schema(schema(df5)) + @test apply_schema(f, schema(df)) != apply_schema(f, schema(df5)) @test sch != sch2 @test isequal(schema(df), schema(df2)) - @test isequal(apply_schema(schema(df)), apply_schema(schema(df2))) + @test isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df2))) @test !isequal(schema(df), schema(df3)) @test !isequal(schema(df), schema(df4)) @test !isequal(schema(df), schema(df5)) @@ -41,7 +41,7 @@ @test !isequal(schema(df), schema(df7)) @test !isequal(schema(df), schema(df8)) @test !isequal(schema(df8), schema(df)) - @test !isequal(apply_schema(schema(df)), apply_schema(schema(df5))) + @test !isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df5))) @test !isequal(sch, sch2) end From ee16796745d61c71632b684a4c00d57c3e344dfc Mon Sep 17 00:00:00 2001 From: mrrobot-2000 Date: Tue, 31 Mar 2020 18:58:43 +0530 Subject: [PATCH 07/14] Fix broken tests. --- test/schema.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/schema.jl b/test/schema.jl index 5f3b1056..b4521648 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -29,7 +29,7 @@ @test schema(df) != schema(df7) @test schema(df) != schema(df8) @test schema(df8) != schema(df) - @test apply_schema(f, schema(df)) != apply_schema(f, schema(df5)) + @test apply_schema(f, schema(df)) == apply_schema(f, schema(df5)) @test sch != sch2 @test isequal(schema(df), schema(df2)) @@ -41,7 +41,7 @@ @test !isequal(schema(df), schema(df7)) @test !isequal(schema(df), schema(df8)) @test !isequal(schema(df8), schema(df)) - @test !isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df5))) + @test isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df5))) @test !isequal(sch, sch2) end From fcf551d312baf03fea20d5bcfbb8ab2e28a1a17b Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 09:26:50 -0400 Subject: [PATCH 08/14] remove isequal methods (default fallback uses ==) --- src/schema.jl | 11 ----------- src/terms.jl | 28 ++++------------------------ test/schema.jl | 12 ------------ 3 files changed, 4 insertions(+), 47 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 22dcab70..25f8a1b2 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -64,17 +64,6 @@ function ==(first::Schema, second::Schema) true end -function Base.isequal(first::Schema, second::Schema) - first === second && return true - first.schema === second.schema && return true - length(first.schema) != length(second.schema) && return false - for key in keys(first) - !haskey(second, key) && return false - !isequal(second[key], first[key]) && return false - end - true -end - """ schema([terms::AbstractVector{<:AbstractTerm}, ]data, hints::Dict{Symbol}) schema(term::AbstractTerm, data, hints::Dict{Symbol}) diff --git a/src/terms.jl b/src/terms.jl index 2fe788a6..29d6fa97 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -1,4 +1,4 @@ -import Base.== , Base.isequal +import Base.== abstract type AbstractTerm end const TermOrTerms = Union{AbstractTerm, NTuple{N, AbstractTerm} where N} const TupleTerm = NTuple{N, TermOrTerms} where N @@ -40,7 +40,7 @@ end width(::ConstantTerm) = 1 ==(first::ConstantTerm, second::ConstantTerm) = first.n == second.n -isequal(first::ConstantTerm, second::ConstantTerm) = isequal(first.n, second.n) + """ FormulaTerm{L,R} <: AbstractTerm @@ -60,9 +60,6 @@ end ==(first::FormulaTerm, second::FormulaTerm) = first.lhs == second.lhs && first.rhs == second.rhs -isequal(first::FormulaTerm, second::FormulaTerm) = - isequal(first.lhs, second.lhs) && - isequal(first.rhs, second.rhs) """ FunctionTerm{Forig,Fanon,Names} <: AbstractTerm @@ -140,9 +137,6 @@ width(::FunctionTerm) = 1 ==(first::FunctionTerm, second::FunctionTerm) = first.forig == second.forig && first.args_parsed == second.args_parsed -isequal(first::FunctionTerm, second::FunctionTerm) = - isequal(first.forig, second.forig) && - isequal(first.args_parsed, second.args_parsed) """ InteractionTerm{Ts} <: AbstractTerm @@ -193,8 +187,7 @@ width(ts::InteractionTerm) = prod(width(t) for t in ts.terms) ==(first::InteractionTerm, second::InteractionTerm) = first.terms == second.terms -isequal(first::InteractionTerm, second::InteractionTerm) = - isequal(first.terms, second.terms) + """ InterceptTerm{HasIntercept} <: AbstractTerm @@ -210,8 +203,6 @@ width(::InterceptTerm{H}) where {H} = H ? 1 : 0 ==(first::InterceptTerm, second::InterceptTerm) = width(first) == width(second) -isequal(first::InterceptTerm, second::InterceptTerm) = - isequal(width(first), width(second)) # Typed terms @@ -244,12 +235,6 @@ width(::ContinuousTerm) = 1 first.min == second.min && first.max == second.max -isequal(first::ContinuousTerm, second::ContinuousTerm) = - isequal(first.sym, second.sym) && - isequal(first.mean, second.mean) && - isequal(first.var, second.var) && - isequal(first.min, second.min) && - isequal(first.max, second.max) """ CategoricalTerm{C,T,N} <: AbstractTerm @@ -276,10 +261,7 @@ CategoricalTerm(sym::Symbol, contrasts::ContrastsMatrix{C,T}) where {C,T} = first.sym == second.sym && width(first) == width(second) && first.contrasts == second.contrasts -isequal(first::CategoricalTerm, second::CategoricalTerm) = - isequal(first.sym, second.sym) && - isequal(width(first), width(second)) && - isequal(first.contrasts, second.contrasts) + """ MatrixTerm{Ts} <: AbstractTerm @@ -299,8 +281,6 @@ width(t::MatrixTerm) = sum(width(tt) for tt in t.terms) ==(first::MatrixTerm, second::MatrixTerm) = first.terms == second.terms -isequal(first::MatrixTerm, second::MatrixTerm) = - isequal(first.terms, second.terms) """ collect_matrix_terms(ts::TupleTerm) diff --git a/test/schema.jl b/test/schema.jl index b4521648..f348e202 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -32,16 +32,4 @@ @test apply_schema(f, schema(df)) == apply_schema(f, schema(df5)) @test sch != sch2 - @test isequal(schema(df), schema(df2)) - @test isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df2))) - @test !isequal(schema(df), schema(df3)) - @test !isequal(schema(df), schema(df4)) - @test !isequal(schema(df), schema(df5)) - @test !isequal(schema(df), schema(df6)) - @test !isequal(schema(df), schema(df7)) - @test !isequal(schema(df), schema(df8)) - @test !isequal(schema(df8), schema(df)) - @test isequal(apply_schema(f, schema(df)), apply_schema(f, schema(df5))) - @test !isequal(sch, sch2) - end From 6850ecf76cd546a4d4e1250fd95498f2fe618090 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 10:03:18 -0400 Subject: [PATCH 09/14] hash schema and test --- src/schema.jl | 2 ++ test/schema.jl | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/schema.jl b/src/schema.jl index 25f8a1b2..3731f0fa 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -64,6 +64,8 @@ function ==(first::Schema, second::Schema) true end +Base.hash(schema::Schema, h::UInt) = hash(schema.schema, h) + """ schema([terms::AbstractVector{<:AbstractTerm}, ]data, hints::Dict{Symbol}) schema(term::AbstractTerm, data, hints::Dict{Symbol}) diff --git a/test/schema.jl b/test/schema.jl index f348e202..90ef63d6 100644 --- a/test/schema.jl +++ b/test/schema.jl @@ -9,27 +9,49 @@ f = apply_schema(f, schema(f, df)) @test f == apply_schema(f, schema(f, df)) + @testset "basic hash and equality" begin + sch1 = schema(f, df) + sch2 = schema(f, df) + @test sch1 == sch2 + @test sch1 !== sch2 + @test hash(sch1) == hash(sch2) + end + + # double categorical column c to test for invariance based on levels df2 = (y = y, a = 1:9, b = b, c = [df.c; df.c]) + @test schema(df) == schema(df2) + @test hash(schema(df)) == hash(schema(df2)) + @test apply_schema(f, schema(df)) == apply_schema(f, schema(df2)) + + # different levels df3 = (y = y, a = 1:9, b = b, c = repeat(["a", "b", "c"], 3)) + @test schema(df) != schema(df3) + + # different length, so different summary stats for continuous df4 = (y = [df.y; df.y], a = [1:9; 1:9], b = [b; b], c = [df.c; df.c]) + @test schema(df) != schema(df4) + + # different names for some columns df5 = (z = y, a = 1:9, b = b, c = repeat(["d", "e", "f"], 3)) + @test schema(df) != schema(df5) + + # different values in continuous column so different stats df6 = (y = y, a = 2:10, b = b, c = repeat(["a", "b", "c"], 3)) + @test schema(df) != schema(df6) + + # different names? df7 = (w = y, d = 1:9, x = b, z = repeat(["d", "e", "f"], 3)) + @test schema(df) != schema(df7) + + # missing column df8 = (y = y, a = 1:9, c = repeat(["d", "e", "f"], 3)) + @test schema(df) != schema(df8) + # different coding/hints sch = schema(df, Dict(:c => DummyCoding(base="e"))) sch2 = schema(df, Dict(:c => EffectsCoding(base="e"))) - - @test schema(df) == schema(df2) - @test apply_schema(f, schema(df)) == apply_schema(f, schema(df2)) - @test schema(df) != schema(df3) - @test schema(df) != schema(df4) - @test schema(df) != schema(df5) - @test schema(df) != schema(df6) - @test schema(df) != schema(df7) - @test schema(df) != schema(df8) - @test schema(df8) != schema(df) - @test apply_schema(f, schema(df)) == apply_schema(f, schema(df5)) + sch3 = schema(df, Dict(:y => DummyCoding())) @test sch != sch2 + @test sch != sch3 end From b62009eb13fef1b6af46c1bab8822014121e5508 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 10:03:56 -0400 Subject: [PATCH 10/14] hash terms and add a few tests --- src/terms.jl | 6 +++++- test/terms.jl | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index 29d6fa97..da85a20b 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -3,6 +3,9 @@ abstract type AbstractTerm end const TermOrTerms = Union{AbstractTerm, NTuple{N, AbstractTerm} where N} const TupleTerm = NTuple{N, TermOrTerms} where N +Base.hash(term::T, h::UInt) where {T<:AbstractTerm} = + foldr(hash, getfield(term, field) for field in fieldnames(T); init=h) + width(::T) where {T<:AbstractTerm} = throw(ArgumentError("terms of type $T have undefined width")) @@ -136,7 +139,8 @@ width(::FunctionTerm) = 1 ==(first::FunctionTerm, second::FunctionTerm) = first.forig == second.forig && - first.args_parsed == second.args_parsed + first.exorig == second.exorig +Base.hash(term::FunctionTerm, h::UInt) = hash(term.forig, hash(term.exorig, h)) """ InteractionTerm{Ts} <: AbstractTerm diff --git a/test/terms.jl b/test/terms.jl index 300b68f4..14004118 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -28,26 +28,36 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = @test t0.var == var([1,2,3]) @test t0.min == 1.0 @test t0.max == 3.0 + @test t0 == concrete_term(t, [3, 2, 1]) + @test hash(t0) == hash(concrete_term(t, [3, 2, 1])) t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} @test string(t1) == "aaa" @test mimestring(t1) == "aaa(DummyCoding:3→2)" + @test t1 == concrete_term(t, [:a, :b, :c]) + @test t1 !== concrete_term(t, [:a, :b, :c]) + @test hash(t1) == hash(concrete_term(t, [:a, :b, :c])) t3 = concrete_term(t, [:a, :b, :c], DummyCoding()) @test t3.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} @test string(t3) == "aaa" @test mimestring(t3) == "aaa(DummyCoding:3→2)" + @test t1 == t3 + @test hash(t1) == hash(t3) t2 = concrete_term(t, [:a, :a, :b], EffectsCoding()) @test t2.contrasts isa StatsModels.ContrastsMatrix{EffectsCoding} @test mimestring(t2) == "aaa(EffectsCoding:2→1)" @test string(t2) == "aaa" + @test t2 == concrete_term(t, [:a, :a, :b], EffectsCoding()) + @test t1 != t2 t2full = concrete_term(t, [:a, :a, :b], StatsModels.FullDummyCoding()) @test t2full.contrasts isa StatsModels.ContrastsMatrix{StatsModels.FullDummyCoding} @test mimestring(t2full) == "aaa(StatsModels.FullDummyCoding:2→2)" @test string(t2full) == "aaa" + @test t1 != t2full end @testset "term operators" begin @@ -75,6 +85,7 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = c = term(:c) @test (a+b)+c == (a,b,c) @test a+(b+c) == (a,b,c) + @test hash((a+b)+c) == hash(a+(b+c)) end @testset "expand nested tuples of terms during apply_schema" begin From 68b240aa1031c6a2425d90b749a453a165d52c2f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 10:16:49 -0400 Subject: [PATCH 11/14] test equality of function terms --- test/terms.jl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/test/terms.jl b/test/terms.jl index 14004118..fec4667b 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -159,5 +159,26 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = end end - + + @testset "equality of function terms" begin + # for now, we use `@formula` to construct the function terms + f1 = @formula(0 ~ (1 | x)).rhs + f2 = @formula(0 ~ (1 | x)).rhs + @test f1 !== f2 + @test f1 == f2 + @test hash(f1) == hash(f2) + + f3 = @formula(0 ~ (1 % x)).rhs + @test f1 != f3 + @test hash(f1) != hash(f3) + + f4 = @formula(0 ~ (x | 1)).rhs + @test f1 != f4 + @test hash(f1) != hash(f4) + + f5 = @formula(0 ~ (1 & y | x)).rhs + @test f1 != f5 + @test hash(f1) != hash(f5) + end + end From 63e7ff4da42a7857df51ea58df9e73f13e82d50d Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 10:40:57 -0400 Subject: [PATCH 12/14] one more test, 1.0 struggles with foldr and generators apparently --- src/terms.jl | 2 +- test/terms.jl | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index 3c9ab8b9..a2344bd2 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -4,7 +4,7 @@ const TermOrTerms = Union{AbstractTerm, Tuple{AbstractTerm, Vararg{AbstractTerm} const TupleTerm = Tuple{TermOrTerms, Vararg{TermOrTerms}} Base.hash(term::T, h::UInt) where {T<:AbstractTerm} = - foldr(hash, getfield(term, field) for field in fieldnames(T); init=h) + foldl((h, x) -> hash(x, h), getfield(term, field) for field in fieldnames(T); init=h) width(::T) where {T<:AbstractTerm} = throw(ArgumentError("terms of type $T have undefined width")) diff --git a/test/terms.jl b/test/terms.jl index 0072ebbe..91603467 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -190,6 +190,11 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = f5 = @formula(0 ~ (1 & y | x)).rhs @test f1 != f5 @test hash(f1) != hash(f5) + + ff1 = @formula(y ~ 1 + x + x & y + (1 + x | g)) + ff2 = @formula(y ~ 1 + x + x & y + (1 + x | g)) + @test ff1 == ff2 + @test hash(ff1) == hash(ff2) end @testset "uniqueness of FunctionTerms" begin From 08a627807fea506ede7c120e5d52c0e9fb4568b5 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 11:01:53 -0400 Subject: [PATCH 13/14] define generic fallback for == of AbstractTerms --- src/terms.jl | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/src/terms.jl b/src/terms.jl index a2344bd2..2533ac2a 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -1,4 +1,3 @@ -import Base.== abstract type AbstractTerm end const TermOrTerms = Union{AbstractTerm, Tuple{AbstractTerm, Vararg{AbstractTerm}}} const TupleTerm = Tuple{TermOrTerms, Vararg{TermOrTerms}} @@ -6,6 +5,11 @@ const TupleTerm = Tuple{TermOrTerms, Vararg{TermOrTerms}} Base.hash(term::T, h::UInt) where {T<:AbstractTerm} = foldl((h, x) -> hash(x, h), getfield(term, field) for field in fieldnames(T); init=h) +function Base.:(==)(a::A, b::B) where {A<:AbstractTerm, B<:AbstractTerm} + fieldnames(A) == fieldnames(B) || return false + return all(getfield(a, field) == getfield(b, field) for field in fieldnames(A)) +end + width(::T) where {T<:AbstractTerm} = throw(ArgumentError("terms of type $T have undefined width")) @@ -42,8 +46,6 @@ struct ConstantTerm{T<:Number} <: AbstractTerm end width(::ConstantTerm) = 1 -==(first::ConstantTerm, second::ConstantTerm) = first.n == second.n - """ FormulaTerm{L,R} <: AbstractTerm @@ -60,10 +62,6 @@ struct FormulaTerm{L,R} <: AbstractTerm rhs::R end -==(first::FormulaTerm, second::FormulaTerm) = - first.lhs == second.lhs && - first.rhs == second.rhs - """ FunctionTerm{Forig,Fanon,Names} <: AbstractTerm @@ -137,7 +135,7 @@ FunctionTerm(forig::Fo, fanon::Fa, names::NTuple{N,Symbol}, FunctionTerm{Fo, Fa, names}(forig, fanon, exorig, args_parsed) width(::FunctionTerm) = 1 -==(first::FunctionTerm, second::FunctionTerm) = +Base.:(==)(first::FunctionTerm, second::FunctionTerm) = first.forig == second.forig && first.exorig == second.exorig Base.hash(term::FunctionTerm, h::UInt) = hash(term.forig, hash(term.exorig, h)) @@ -191,9 +189,6 @@ struct InteractionTerm{Ts} <: AbstractTerm end width(ts::InteractionTerm) = prod(width(t) for t in ts.terms) -==(first::InteractionTerm, second::InteractionTerm) = - first.terms == second.terms - """ InterceptTerm{HasIntercept} <: AbstractTerm @@ -207,8 +202,7 @@ via the [`implicit_intercept`](@ref) trait). struct InterceptTerm{HasIntercept} <: AbstractTerm end width(::InterceptTerm{H}) where {H} = H ? 1 : 0 -==(first::InterceptTerm, second::InterceptTerm) = - width(first) == width(second) +Base.:(==)(first::InterceptTerm{T}, second::InterceptTerm{S}) where {T,S} = T == S # Typed terms @@ -234,13 +228,6 @@ struct ContinuousTerm{T} <: AbstractTerm end width(::ContinuousTerm) = 1 -==(first::ContinuousTerm, second::ContinuousTerm) = - first.sym == second.sym && - first.mean == second.mean && - first.var == second.var && - first.min == second.min && - first.max == second.max - """ CategoricalTerm{C,T,N} <: AbstractTerm @@ -263,11 +250,6 @@ width(::CategoricalTerm{C,T,N}) where {C,T,N} = N CategoricalTerm(sym::Symbol, contrasts::ContrastsMatrix{C,T}) where {C,T} = CategoricalTerm{C,T,length(contrasts.termnames)}(sym, contrasts) -==(first::CategoricalTerm, second::CategoricalTerm) = - first.sym == second.sym && - width(first) == width(second) && - first.contrasts == second.contrasts - """ MatrixTerm{Ts} <: AbstractTerm @@ -285,9 +267,6 @@ end MatrixTerm(t::AbstractTerm) = MatrixTerm((t, )) width(t::MatrixTerm) = sum(width(tt) for tt in t.terms) -==(first::MatrixTerm, second::MatrixTerm) = - first.terms == second.terms - """ collect_matrix_terms(ts::TupleTerm) collect_matrix_terms(t::AbstractTerm) = collect_matrix_term((t, )) From 63860c698ed55e4ce9a3d2c41502e22d90c9baf3 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 21 Sep 2021 11:03:56 -0400 Subject: [PATCH 14/14] whoops --- src/schema.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema.jl b/src/schema.jl index 253a6b81..0253e451 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -53,7 +53,7 @@ Base.merge!(a::Schema, b::Schema) = (merge!(a.schema, b.schema); a) Base.keys(schema::Schema) = keys(schema.schema) Base.haskey(schema::Schema, key) = haskey(schema.schema, key) -function ==(first::Schema, second::Schema) +function Base.:(==)(first::Schema, second::Schema) first === second && return true first.schema === second.schema && return true length(first.schema) != length(second.schema) && return false