MP-SPDZ/mpcstats_lib.py at mpcstats-lib · MPCStats/MP-SPDZ · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"""
Contains functions that can be used in MP-SPDZ circuits.
"""

from Compiler.library import print_ln, for_range
from Compiler.types import sint, sfix, Matrix, sfloat, Array
from Compiler.util import if_else
from Compiler.mpc_math import sqrt, exp2_fx, log2_fx


# Sentinel value standing in for an absent entry: `join` and `where` write it
# into their outputs, and the statistics functions in this module skip every
# entry equal to it.
# geometric_mean assumes MAGIC_NUMBER to be non-negative
# NOTE(review): a genuine data value equal to MAGIC_NUMBER cannot be told apart
# from the sentinel — confirm that inputs never contain it.
MAGIC_NUMBER = 999

# To enforce round to the nearest integer, instead of probabilistic truncation
# Ref: https://github.com/data61/MP-SPDZ/blob/e93190f3b72ee2d27837ca1ca6614df6b52ceef2/doc/machine-learning.rst?plain=1#L347-L353
sfix.round_nearest = True


def read_data(party_index: int, num_columns: int, num_rows: int) -> Matrix:
    """
    Load one party's private inputs into a secret-shared Matrix.

    Inputs are consumed one at a time and stored column by column: all
    `num_rows` entries of column 0 first, then column 1, and so on.

    :param party_index: Index of the party whose input is read
    :param num_columns: Size of the first dimension of the result
    :param num_rows: Size of the second dimension of the result
    :return: A `num_columns` x `num_rows` Matrix of sint
    """
    matrix = Matrix(num_columns, num_rows, sint)
    # TODO: use @for_range_opt instead?
    for col in range(num_columns):
        for row in range(num_rows):
            secret_value = sint.get_input_from(party_index)
            matrix[col][row] = secret_value
    return matrix


def print_data(data: Matrix):
    """
    Reveal every entry of `data` and print each one on its own line.

    Entries are visited column by column (first dimension outermost), and
    each line is labelled with the entry's two indices.

    :param data: The Matrix to reveal and print
    """
    column_count, row_count = data.shape[0], data.shape[1]
    for col in range(column_count):
        for row in range(row_count):
            # The indices are baked into the format string at compile time;
            # the remaining %s is filled with the revealed value at run time.
            label = "data[{}][{}]: %s".format(col, row)
            print_ln(label, data[col][row].reveal())


# Top 5 functions to implement

def mean(data: list[sint]):
    """
    Average the entries of `data`, ignoring MAGIC_NUMBER placeholders.

    :param data: Values to average; entries equal to MAGIC_NUMBER are skipped
    :return: Sum of the remaining entries divided by how many there are

    NOTE(review): the semantics of `/` depend on the element type actually
    passed in (e.g. sint vs sfix) — confirm against callers.
    """
    # First pass: add up the entries that are not placeholders.
    running_total = 0
    for value in data:
        running_total = running_total + if_else(value != MAGIC_NUMBER, value, 0)

    # Second pass: count those same entries.
    valid_entries = 0
    for value in data:
        valid_entries = valid_entries + if_else(value != MAGIC_NUMBER, 1, 0)

    return running_total / valid_entries


def median(data: list[sint]):
    """
    Compute the median of `data`, ignoring MAGIC_NUMBER placeholders.

    The values are copied into an Array and sorted. `size` is the number of
    entries different from MAGIC_NUMBER. The loop then visits every index and
    uses secret equality tests as 0/1 masks to pick out the wanted entries:

    * odd `size`: the entry at index (size - 1) / 2;
    * even `size`: half of the entry at index size / 2 plus half of the entry
      at index size / 2 - 1.

    :param data: Values to take the median of
    :return: `median_odd` when `size` is odd, `median_even` when it is even,
        combined arithmetically via `size % 2`

    NOTE(review): the indices are counted from the start of the sorted array,
    so this presumably relies on MAGIC_NUMBER entries sorting after all real
    values — confirm that real values are always smaller than MAGIC_NUMBER.
    """
    # TODO: Check if Array.create_from is properly constrained // if I dont put reference, it's just from the mp-spdz doc itself
    data = Array.create_from(data)
    # TODO: Check if there's a need to use sint(0), can we just use 0? would that violate constraint?
    # Accumulators for the two candidate results; only one is used in the end.
    median_odd = sint(0)
    median_even = sint(0)
    data.sort()
    # Number of entries that are not placeholders.
    size = sum(if_else(i!= MAGIC_NUMBER, 1, 0) for i in data)

    # TODO: Check if for_range is any different than naive Python for-loop
    @for_range(len(data))
    def _(i):
        # TODO: Check if wrapping sint() makes sense/ properly constrained
        # TODO: Check why we cannot just use size.int_div(2) -> it returns wrong result, so now we use the method below instead.
        # The mask is 1 only when 2*i + size%2 == size, i.e. at the middle index.
        median_odd.update(median_odd+(size==2*sint(i)+size%2)*data[i])
        # TODO: Check if there's the need to use update: See example in Compiler.library.for_range(start, stop=None, step=None) in the mp-spdz doc itself
        # First mask selects the same index as above, second mask the index
        # just below it (2*i + size%2 == size - 2); each contributes half.
        median_even.update(median_even+(size==2*sint(i)+size%2)*data[i]/2+(size-2==2*sint(i)+size%2)*data[i]/2)
    # TODO: Check if size%2 is properly constrained
    # size%2 acts as a 0/1 selector between the two accumulated candidates.
    return (size%2)*median_odd + (1-size%2)*median_even


def join(data1: Matrix, data2: Matrix, data1_column_index: int, data2_column_index: int) -> Matrix:
    """
    Join two matrices based on the matching index in the specified columns.

    The result has all columns of data1 followed by all columns of data2 and
    as many rows as data1. For every row of data1, the row of data2 whose key
    matches is copied into the data2 section; rows of data1 without a match
    keep MAGIC_NUMBER in every data2 column. If several rows of data2 carry
    the same key, the last matching row wins because later matches overwrite
    earlier ones.

    :param data1: The first matrix
    :param data2: The second matrix
    :param data1_column_index: The column index in data1 to match with data2_column_index
    :param data2_column_index: The column index in data2 to match with data1_column_index
    :return: A new Matrix of shape
        (data1.shape[0] + data2.shape[0], data1.shape[1])

    For example, if data1 = [
        [0, 1, 2, 3],
        [152, 160, 170, 180]
    ], data2 = [
        [3, 0, 4],
        [50, 60, 70],
    ], data1_column_index = 0, data2_column_index = 0, then the output will be [
        [0, 1, 2, 3],
        [152, 160, 170, 180],
        [0, MAGIC_NUMBER, MAGIC_NUMBER, 3],
        [60, MAGIC_NUMBER, MAGIC_NUMBER, 50],
    ]
    """
    # E.g. [2, 4]
    num_columns_1 = data1.shape[0]
    num_rows_1 = data1.shape[1]

    # E.g. [2, 3]
    num_columns_2 = data2.shape[0]
    num_rows_2 = data2.shape[1]

    new_data = Matrix(num_columns_1 + num_columns_2, num_rows_1, sint)
    # Initialize the first part of the matrix with data1
    for i in range(num_columns_1):
        for j in range(num_rows_1):
            new_data[i][j] = data1[i][j]
    # Initialize the rest of the matrix with MAGIC_NUMBER.
    # The data2 section starts right after the data1 columns, so the offset
    # must be num_columns_1; any other offset would clobber data1 columns or
    # miss data2 columns whenever the two matrices differ in column count.
    for i in range(num_columns_2):
        for j in range(num_rows_1):
            new_data[num_columns_1 + i][j] = MAGIC_NUMBER

    # Check the matching index in data1 and data2
    for i in range(num_rows_1):
        # Find the corresponding index in data2[data2_column] for data1[data1_column][i]
        id_in_data1 = data1[data1_column_index][i]
        for j in range(num_rows_2):
            # Now checking if data2[data2_column][j] is the same as data1[data1_column][i]
            id_in_data2 = data2[data2_column_index][j]
            match = id_in_data1 == id_in_data2
            # If the match is found, copy row j of data2 (all of its columns)
            # into row i of the data2 section; otherwise keep what is there.
            for k in range(num_columns_2):
                new_data[num_columns_1 + k][i] = if_else(
                    match,
                    data2[k][j],
                    new_data[num_columns_1 + k][i]
                )
    return new_data


def covariance(data1: list[sint], data2: list[sint]):
    """
    Compute the covariance of `data1` and `data2`, ignoring MAGIC_NUMBER
    placeholders.

    Each series is averaged over its non-placeholder entries, placeholders
    are then replaced by that average so their deviation is zero, and the
    products of the paired deviations are summed and divided by `count - 1`.

    :param data1: First series
    :param data2: Second series; indexed with the same positions as `data1`
    :return: Sum of (data1[i] - mean1) * (data2[i] - mean2) divided by
        (count - 1)

    NOTE(review): `count` is derived from `data1` only and is used for both
    means and for the final divisor — this presumably assumes both series
    have placeholders at the same positions; confirm against callers.
    """
    n = len(data1)
    # Sums over the entries that are not placeholders.
    total1 = sum(if_else(i!= MAGIC_NUMBER, i, 0) for i in data1)
    total2 = sum(if_else(i!= MAGIC_NUMBER, i, 0) for i in data2)
    # Number of non-placeholder entries, counted on data1 only.
    count = sum(if_else(i!= MAGIC_NUMBER, 1, 0) for i in data1)
    mean1 = total1/count
    mean2 = total2/count
    # Replace placeholders by the mean so that (value - mean) is zero for them
    # and they drop out of the sum below.
    data1 = Array.create_from(if_else(i!=MAGIC_NUMBER, i, mean1) for i in data1)
    data2 = Array.create_from(if_else(i!=MAGIC_NUMBER, i, mean2) for i in data2)
    # TODO: Check if there's a need to use sfloat(0), can we do something like 0.0
    # Accumulator for the sum of products of deviations.
    x = sfloat(0)
    @for_range(n)
    def _(i):
        x.update(x+(data1[i]-mean1)*(data2[i]-mean2))
    return x/(count-1)


def correlation(data1: list[sint], data2: list[sint]):
    """
    Compute the correlation of `data1` and `data2`, ignoring MAGIC_NUMBER
    placeholders.

    Each series is averaged over its non-placeholder entries and placeholders
    are replaced by that average so their deviation is zero. The result is
    the sum of products of paired deviations divided by the product of the
    square roots of each series' summed squared deviations.

    :param data1: First series
    :param data2: Second series; indexed with the same positions as `data1`
    :return: numerator / (sqrt(denominator1) * sqrt(denominator2))

    NOTE(review): `count` is derived from `data1` only and is used for both
    means — this presumably assumes both series have placeholders at the
    same positions; confirm against callers.
    """
    n = len(data1)
    # Sums over the entries that are not placeholders.
    total1 = sum(if_else(i!= MAGIC_NUMBER, i, 0) for i in data1)
    total2 = sum(if_else(i!= MAGIC_NUMBER, i, 0) for i in data2)
    # Number of non-placeholder entries, counted on data1 only.
    count = sum(if_else(i!= MAGIC_NUMBER, 1, 0) for i in data1)
    mean1 = total1/count
    mean2 = total2/count
    # Replace placeholders by the mean so that (value - mean) is zero for them
    # and they drop out of all three sums below.
    data1 = Array.create_from(if_else(i!=MAGIC_NUMBER, i, mean1) for i in data1)
    data2 = Array.create_from(if_else(i!=MAGIC_NUMBER, i, mean2) for i in data2)
    # Accumulators: cross-deviation sum and the two squared-deviation sums.
    numerator = sfloat(0)
    denominator1 = sfloat(0)
    denominator2 = sfloat(0)
    @for_range(n)
    def _(i):
        numerator.update(numerator+(data1[i]-mean1)*(data2[i]-mean2))
        denominator1.update(denominator1+(data1[i]-mean1).square())
        denominator2.update(denominator2+(data2[i]-mean2).square())
    # TODO: Check if wrapping sfix() is properly constrained.
    # The sfloat sums are converted to sfix before being passed to sqrt.
    return numerator/(sqrt(sfix(denominator1))*sqrt(sfix(denominator2)))


def where(_filter: list[sint], data: list[sint]):
    """
    Mask `data` with `_filter`, position by position.

    :param _filter: Per-position selectors (presumably 0/1 values — confirm
        against callers); must provide at least len(data) entries
    :param data: Values to select from
    :return: sint Array with len(data) entries, where entry i is
        if_else(_filter[i], data[i], MAGIC_NUMBER)
    """
    length = len(data)
    values = Array.create_from(data)
    selectors = Array.create_from(_filter)
    masked = sint.Array(length)

    @for_range(length)
    def _(position):
        chosen = if_else(selectors[position], values[position], MAGIC_NUMBER)
        masked[position] = chosen

    return masked


def geometric_mean(data: list[sint]):
    """
    Compute the geometric mean of `data`, ignoring MAGIC_NUMBER placeholders.

    The result is 2 raised to the average base-2 logarithm of the entries
    that are not placeholders.

    :param data: Values to average
    :return: exp2_fx(sum of log2_fx(entry) / number of entries), taken over
        the non-placeholder entries
    """
    # log2_fx is evaluated for every entry, placeholders included; if_else
    # only decides whether the result is added to the sum.
    log_total = 0
    for value in data:
        log_total = log_total + if_else(value != MAGIC_NUMBER, log2_fx(value), 0)

    # Count the entries that contributed a logarithm.
    valid_entries = 0
    for value in data:
        valid_entries = valid_entries + if_else(value != MAGIC_NUMBER, 1, 0)

    mean_log = log_total / valid_entries
    return exp2_fx(mean_log)

# LATER

def harmonic_mean(data: list[sint]):
    """
    Placeholder for the harmonic mean of `data`; not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement harmonic_mean
    raise NotImplementedError()


def mode(data: list[sint]):
    """
    Placeholder for the mode of `data`; not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement mode
    raise NotImplementedError()


def pstdev(data: list[sint]):
    """
    Placeholder for `pstdev` (presumably the population standard deviation,
    as in Python's `statistics.pstdev` — confirm); not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement pstdev
    raise NotImplementedError()


def pvariance(data: list[sint]):
    """
    Placeholder for `pvariance` (presumably the population variance, as in
    Python's `statistics.pvariance` — confirm); not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement pvariance
    raise NotImplementedError()


def stdev(data: list[sint]):
    """
    Placeholder for `stdev` (presumably the sample standard deviation, as in
    Python's `statistics.stdev` — confirm); not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement stdev
    raise NotImplementedError()


def variance(data: list[sint]):
    """
    Placeholder for `variance` (presumably the sample variance, as in
    Python's `statistics.variance` — confirm); not implemented yet.

    :param data: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement variance
    raise NotImplementedError()


def linear_regression(data1: list[sint], data2: list[sint]):
    """
    Placeholder for a linear regression over `data1` and `data2`; not
    implemented yet.

    :param data1: Currently unused
    :param data2: Currently unused
    :raises NotImplementedError: always
    """
    # TODO: implement linear_regression
    raise NotImplementedError()