The following IPython notebook summarizes the main functions used for clustering the auction graph with the restreaming linear deterministic greedy (R-LDG) algorithm, running the auction simulator, and implementing the experiment-of-experiments framework.
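Throughout, the notebook assumes that the bid data and a few graph constants were defined in earlier cells. A minimal, hypothetical stand-in for that setup (the values below are illustrative only; NUM_CUST_NODES matches the bidder count used throughout):
import numpy as np

NUM_CUST_NODES = 16268   # number of bidders (customer-side nodes)
NUM_NODES = 50000        # hypothetical: bidders plus auction nodes
NUM_EDGES = 200000       # hypothetical: number of (bidder, auction) edges
NUM_PARTITIONS = 100     # hypothetical: number of clusters

# bids / bidders: lists of lists, one entry per auction
# (see the Auction class docstring below). Toy stand-ins:
bids = [[10.4, 3.5, 2.1], [9.2, 1.0]]
bidders = [[1, 7, 2], [90, 2]]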
import ast
import collections
import numpy as np
import os
import pandas
import tqdm # a progress-bar package.
def get_global_reserves(reserves, assignments):
    # reserves.shape = (16268, 2): column 0 holds the control reserve,
    # column 1 the treatment reserve; assignments selects one per bidder.
return reserves[range(len(reserves)), assignments]
def get_shifts(ctr_func):
    # shifts[j] = ctr_func[j] - ctr_func[j+1], the CTR drop between
    # positions j and j+1.
shifts = np.zeros(len(ctr_func))
shifts[:-1] = ctr_func[:-1] - ctr_func[1:]
return shifts
def get_parts(bids, reserves):
    bids = np.asarray(bids)  # bids may arrive as a plain Python list
    meets_reserve_idx = (bids > reserves).nonzero()
part_bids = bids[meets_reserve_idx]
part_reserves = reserves[meets_reserve_idx]
return meets_reserve_idx, part_bids, part_reserves
def get_payments(bids, shifts, reserves):
    # Assumes bids are sorted in decreasing order!
    # pre_payments[j] = sum_{k > j} bids[k] * (ctr[k-1] - ctr[k]),
    # computed below as a reversed cumulative sum.
    n_bidders = len(bids)
    pre_payments = np.zeros(n_bidders)
    sum_items = shifts[:n_bidders-1] * bids[1:]
    pre_payments[:n_bidders-1] = np.cumsum(sum_items[::-1])[::-1]
payments = np.maximum(pre_payments, reserves)
return payments
def get_utilities(bids, payments, ctr_func):
return (bids - payments) * ctr_func[:len(bids)]
def run_auction(bids, reserves, ctr_func, shifts):
utilities = np.zeros(len(bids))
indices, pbids, preserves = get_parts(bids, reserves)
if len(pbids) != 0:
payments = get_payments(pbids, shifts, preserves)
putils = get_utilities(pbids, payments, ctr_func)
utilities[indices] = putils
    return utilities
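As a quick sanity check, a small illustrative call to run_auction (the bids and reserves below are made up; the CTR curve matches the one defined in the Auction class later on):
ctr_func = np.zeros(11)
ctr_func[:5] = np.array([.022, .011, .0075, .005, .003])
shifts = get_shifts(ctr_func)
example_bids = np.array([400., 350., 120.])      # sorted in decreasing order
example_reserves = np.array([100., 100., 200.])  # third bid misses its reserve
print(run_auction(example_bids, example_reserves, ctr_func, shifts))
# The third bidder gets zero utility; the first two pay their reserves,
# since the pre-payments from the CTR shifts are far below 100.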
def get_global_assignment(n_clusters,
clustering,
prop_treated):
A = np.zeros(n_clusters, dtype=int)
A[:int(n_clusters * prop_treated)] = 1
np.random.shuffle(A)
return A[clustering]
def get_EofE_assignment(clustering_1,
nbr_clusters_1,
clustering_2,
nbr_clusters_2,
prop_treated):
W = np.zeros(len(clustering_1), dtype=np.int32)
W[:int(len(clustering_1)/2)] = 1
np.random.shuffle(W)
cl_1_indexes = W.nonzero()
cl_2_indexes = (1 - W).nonzero()
Z1 = get_global_assignment(nbr_clusters_1,
clustering_1[cl_1_indexes],
prop_treated)
Z2 = get_global_assignment(nbr_clusters_2,
clustering_2[cl_2_indexes],
prop_treated)
Z = np.zeros(len(clustering_1), dtype=np.int32)
Z[cl_1_indexes] = Z1
Z[cl_2_indexes] = Z2
return Z, W
def get_util_estimate(utilities, assignments):
t_mean = np.mean(utilities[assignments == 1])
c_mean = np.mean(utilities[assignments == 0])
return t_mean - c_mean
def analyse_EofE(utilities, Z, W):
u_t1 = utilities[np.logical_and(Z == 1, W == 1)]
u_c1 = utilities[np.logical_and(Z == 0, W == 1)]
u_t2 = utilities[np.logical_and(Z == 1, W == 0)]
u_c2 = utilities[np.logical_and(Z == 0, W == 0)]
    # Multiplied by 4 because M/m = 2 and len(Z) = 2 * N: each of the
    # four (Z, W) cells contains len(Z)/4 bidders in expectation, so the
    # difference of cell means is 4 * (sum_t - sum_c) / len(Z).
est1 = 4 * (np.sum(u_t1) - np.sum(u_c1)) / (len(Z))
est2 = 4 * (np.sum(u_t2) - np.sum(u_c2)) / (len(Z))
return est1, est2
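A toy end-to-end check of the assignment and analysis helpers (the cluster labels and utilities below are made up):
clustering_a = np.array([0, 0, 1, 1, 0, 1, 0, 1])  # hypothetical clustering 1
clustering_b = np.array([0, 1, 0, 1, 0, 1, 0, 1])  # hypothetical clustering 2
Z, W = get_EofE_assignment(clustering_a, 2, clustering_b, 2, prop_treated=.5)
toy_utilities = np.random.rand(len(Z))
est1, est2 = analyse_EofE(toy_utilities, Z, W)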
Note: to achieve a significant speedup, we assume that bids (and the corresponding bidders) are sorted in decreasing order of bid!
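One way to enforce this is to sort every auction once, up front, keeping the bidder IDs aligned with their bids (a minimal sketch):
def sort_auctions(bids, bidders):
    # Sort each auction's bids in decreasing order, reordering the
    # matching bidder IDs in lockstep.
    sorted_bids, sorted_bidders = [], []
    for bid_list, bidder_list in zip(bids, bidders):
        order = np.argsort(bid_list)[::-1]
        sorted_bids.append([bid_list[j] for j in order])
        sorted_bidders.append([bidder_list[j] for j in order])
    return sorted_bids, sorted_bidders

bids, bidders = sort_auctions(bids, bidders)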
class Auction(object):
""""
Main auction class
bids: list of lists. First level represents each individual auction.
The second level represents the bids in decreasing order.
Ex: [[10.4, 3.5, 2.1], [9.2, 1.0]] represents two auctions,
with 3 and 2 bids respectively.
bidders: list of lists. IDs of bidders making the bids as listed
above. Ex: [[1, 7, 2], [90, 2]] represents two auctions,
with 3 and 2 bidders respectively. Bidder 2 bids each time.
    clustering_{}_f: clustering file as produced by the clustering
        code in this notebook, containing an array of length
        number_of_bidders whose entries give each bidder's cluster ID.
n_auctions: maximum number of auctions to run.
min/max_reserve: reserve is drawn uniformly from this segment for
each bidder.
n_bidders: number of bidders
prop_treated: proportion of bidder clusters to assign to the
intervention.
"""
def __init__(self,
bids,
bidders,
clustering_1_f=None,
clustering_2_f=None,
n_auctions=None,
min_reserve=100,
max_reserve=500,
n_bidders=16268,
prop_treated=.5):
        self.bidders = bidders
self.bids = bids
if n_auctions is None:
self.n_auctions = len(bidders)
else:
self.n_auctions = n_auctions
self.n_bidders = n_bidders
assert len(bidders) == len(bids)
if clustering_1_f is not None:
self.clustering_1 = np.load(file=clustering_1_f)
self.n_cluster_1 = len(np.bincount(self.clustering_1))
if clustering_2_f is not None:
self.clustering_2 = np.load(file=clustering_2_f)
self.n_cluster_2 = len(np.bincount(self.clustering_2))
self.ctr_func = np.zeros(11)
self.ctr_func[:5] = np.array([.022, .011, .0075, .005, .003])
self.shifts = get_shifts(self.ctr_func)
self.reserve_arr = np.zeros((n_bidders, 2))
self.reserve_arr[:, 1] = np.random.randint(
min_reserve, max_reserve, size=n_bidders)
        self.prop_treated = prop_treated
def get_ground_truth(self):
tutils = np.zeros(self.n_bidders)
cutils = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids, self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
t_reserve_arr = self.reserve_arr[bidder, 1]
tutils[bidder] += run_auction(bid,
t_reserve_arr,
self.ctr_func,
self.shifts)
c_reserve_arr = self.reserve_arr[bidder, 0]
cutils[bidder] += run_auction(bid,
c_reserve_arr,
self.ctr_func,
self.shifts)
return np.mean(tutils - cutils)
def run_experiment(self, cluster_idx, n_samples):
if cluster_idx == 1:
n_clusters = self.n_cluster_1
clustering = self.clustering_1
else:
n_clusters = self.n_cluster_2
clustering = self.clustering_2
assert len(clustering) == self.n_bidders
estimates = np.zeros(n_samples)
for i in tqdm.tqdm(range(n_samples)):
Z = get_global_assignment(n_clusters,
clustering,
self.prop_treated)
global_reserves = get_global_reserves(self.reserve_arr, Z)
utilities = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids,
self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
reserves = global_reserves[bidder]
utilities[bidder] += run_auction(bid,
reserves,
self.ctr_func,
self.shifts)
estimates[i] = get_util_estimate(utilities, Z)
return estimates
def run_EofE(self, n_samples):
estimates = np.zeros([n_samples, 2])
for i in tqdm.tqdm(range(n_samples)):
Z, W = get_EofE_assignment(self.clustering_1,
self.n_cluster_1,
self.clustering_2,
self.n_cluster_2,
self.prop_treated)
global_reserves = get_global_reserves(self.reserve_arr, Z)
utilities = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids,
self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
reserves = global_reserves[bidder]
utilities[bidder] += run_auction(bid,
reserves,
self.ctr_func,
self.shifts)
estimates[i] = analyse_EofE(utilities, Z, W)
return estimates
def plot_treatment_dist(self, cluster_idx, n_samples):
if cluster_idx == 1:
n_clusters = self.n_cluster_1
clustering = self.clustering_1
else:
n_clusters = self.n_cluster_2
clustering = self.clustering_2
S = []
for i in tqdm.tqdm(range(n_samples)):
Z = get_global_assignment(n_clusters,
clustering,
.5)
            for _, bid, bidder in zip(range(100000), self.bids, self.bidders):
S.append(np.mean(Z[bidder]))
return S
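Putting the pieces together, a hypothetical driver (the clustering file names are placeholders; they correspond to the arrays saved at the end of this notebook):
auction = Auction(bids,
                  bidders,
                  clustering_1_f='clustering_rldg.npy',    # placeholder path
                  clustering_2_f='clustering_random.npy')  # placeholder path
ground_truth = auction.get_ground_truth()
estimates_rldg = auction.run_experiment(cluster_idx=1, n_samples=100)
eofe_estimates = auction.run_EofE(n_samples=100)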
Shuffling the node labels helps with clustering, since the streaming greedy algorithm is sensitive to the order in which nodes arrive:
the_shuffle = np.arange(NUM_NODES)
np.random.shuffle(the_shuffle)
Note: the following code is a modification of Justin Vincent's code: http://algorithmshop.com/20131213-graph-partitioning.html
def to_undirected(edge_iterable, num_edges, num_nodes, shuffle=None):
"""Takes an iterable of edges and produces the list of
edges for the undirected graph.
    > to_undirected([[0, 1, 1.], [1, 2, 1.], [2, 10, 1.]], 3, 11)
array([[ 0, 1],
[ 1, 0],
[ 1, 2],
[ 2, 1],
[ 2, 10],
[10, 2]])
"""
# need int64 to do gross bithacks
as_array = np.zeros((num_edges, 2), dtype=np.int64)
for (i, (n_0, n_1, weight)) in enumerate(edge_iterable):
as_array[i,0] = n_0
as_array[i,1] = n_1
# The graph is directed, but we want to make it undirected,
# which means we will duplicate some rows.
left_nodes = as_array[:,0]
right_nodes = as_array[:,1]
if shuffle is not None:
left_nodes = shuffle.take(left_nodes)
right_nodes = shuffle.take(right_nodes)
# numpy.unique will not unique whole rows, so this little bit-hacking
# is a quick way to get unique rows after making a flipped copy of
# each edge.
max_bits = int(np.ceil(np.log2(num_nodes + 1)))
encoded_edges_forward = np.left_shift(left_nodes, max_bits) \
| right_nodes
# Flip the columns and do it again:
encoded_edges_reverse = np.left_shift(right_nodes, max_bits) \
| left_nodes
unique_encoded_edges = np.unique(np.hstack((encoded_edges_forward,
encoded_edges_reverse)))
left_node_decoded = np.right_shift(unique_encoded_edges, max_bits)
# Mask out the high order bits
right_node_decoded = (2 ** (max_bits) - 1) & unique_encoded_edges
undirected_edges = np.vstack((left_node_decoded,
right_node_decoded)).T.astype(np.int32)
# ascontiguousarray so that it's c-contiguous for cython code below
return np.ascontiguousarray(undirected_edges)
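A quick round-trip check of the encode/decode bit-hack, using the toy values from the docstring:
# With num_nodes = 11 we get max_bits = 4, so the edge (2, 10)
# encodes to (2 << 4) | 10 = 42 and decodes back via shift and mask.
max_bits = int(np.ceil(np.log2(11 + 1)))             # = 4
encoded = np.left_shift(np.int64(2), max_bits) | 10  # = 42
left = np.right_shift(encoded, max_bits)             # = 2
right = (2 ** max_bits - 1) & encoded                # = 10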
def get_weights(edges, edge_iterable, shuffle=None):
graph_dict = {}
weights = np.zeros(len(edges), dtype=np.float32)
for (node_1, node_2, weight) in edge_iterable:
if shuffle is not None:
node_1, node_2 = shuffle.take([node_1, node_2])
graph_dict[tuple([node_1, node_2])] = weight
graph_dict[tuple([node_2, node_1])] = weight
for (node_1, node_2), idx in zip(edges, range(len(weights))):
if node_1 != node_2:
weights[idx] = graph_dict[tuple([node_1, node_2])]
return weights
def get_clean_data(shuffle=None):
edges = to_undirected(row_generator(),
NUM_EDGES,
NUM_NODES,
shuffle)
weights = get_weights(edges,
row_generator(),
shuffle)
return edges, weights
def row_generator():
return iterate_through_edges(bids, bidders)
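The iterate_through_edges helper is not shown in this summary; below is a hypothetical sketch consistent with how row_generator is consumed above, connecting each bidder to a node representing the auction with unit weight per bid. The actual graph construction may differ.
def iterate_through_edges(bids, bidders):
    # Yields (node_1, node_2, weight) triples; auction nodes are labeled
    # after the bidder nodes. The bids argument is unused in this sketch
    # but kept for signature compatibility.
    for auction_idx, bidder_list in enumerate(bidders):
        auction_node = NUM_CUST_NODES + auction_idx
        for bidder in bidder_list:
            yield (bidder, auction_node, 1.0)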
print("Normalizing data")
%time edges, weights = get_clean_data(the_shuffle)
print('\nEDGES SHAPE: {}'.format(edges.shape))
print('\nWEIGHTS SHAPE: {}'.format(weights.shape))
def score(assignment, edges, weights):
"""Compute the score given an assignment of vertices.
N nodes are assigned to clusters 0 to K-1.
    assignment: Vector where assignment[i] is the cluster node i is assigned to.
    edges: The edges in the graph, assumed to appear once in each direction.
    weights: weights[j] is the weight of edge edges[j].
    Returns: (total wasted bin space, fraction of total edge weight cut)
"""
balance = np.bincount(assignment) / len(assignment)
waste = (np.max(balance) - balance).sum()
mismatch = 0
for (node_1, node_2), weight in zip(edges, weights):
if assignment[node_1] != assignment[node_2]:
mismatch += weight
total_weight = weights.sum()
cut_ratio = mismatch / total_weight
return (waste, cut_ratio)
%load_ext cython
%pylab inline
%%cython
import numpy as np
cimport cython
cdef int UNMAPPED = -1
def linear_deterministic_greedy(int[:, ::1] edges,
                                float[::1] weights,
                                int num_nodes,
                                int num_cust_nodes,
                                int num_partitions,
                                int[::1] partition,
                                int[::1] is_customer_node):
"""
This algorithm favors a cluster if it has many neighbors of a node,
but penalizes the cluster if it is close to capacity.
    edges: An [:, 2] array of edges, grouped by left node.
    weights: weights[i] is the weight of edges[i].
    num_nodes: The number of nodes in the graph.
    num_cust_nodes: The number of customer (bidder) nodes, which are the
        only nodes counted against partition capacity.
    num_partitions: How many partitions we are breaking the graph into.
    partition: The partition from a previous run. Used for restreaming.
    is_customer_node: 1 for customer nodes, 0 otherwise.
    Returns: A new partition.
"""
assert len(weights) == len(edges)
# The output partition
if partition is None:
partition = np.repeat(np.int32(UNMAPPED), num_nodes)
    cdef int[::1] partition_sizes = np.zeros(num_partitions,
                                             dtype=np.int32)
    cdef float[::1] partition_votes = np.zeros(num_partitions,
                                               dtype=np.float32)
# Fine to be a little off, to stay integers
cdef int partition_capacity = num_cust_nodes / num_partitions
cdef int last_left = edges[0,0]
cdef int i = 0
cdef int left = 0
cdef int right = 0
cdef int arg = 0
cdef int max_arg = 0
cdef float max_val = 0
cdef float val = 0
cdef int len_edges = len(edges)
cdef float weight = 0
for i in range(len_edges):
left = edges[i,0]
right = edges[i,1]
weight = weights[i]
if last_left != left:
# We have found a new node so assign last_left to a partition
max_arg = 0
max_val = (partition_votes[0]) * (
partition_capacity - partition_sizes[0])
for arg in range(1, num_partitions):
val = (partition_votes[arg]) * (
partition_capacity - partition_sizes[arg])
if val > max_val:
max_arg = arg
max_val = val
if max_val == 0:
max_arg = arg
                # No neighbors (or all candidates full) so pick the next
                # partition with spare capacity, starting from a
                # pseudo-random offset
for arg in range(i % num_partitions, num_partitions):
if partition_sizes[arg] < partition_capacity:
max_arg = arg
max_val = 1
break
if max_val == 0:
for arg in range(0, i % num_partitions):
if partition_sizes[arg] < partition_capacity:
max_arg = arg
break
if is_customer_node[last_left]:
partition_sizes[max_arg] += 1
partition[last_left] = max_arg
partition_votes[:] = 0
last_left = left
if partition[right] != UNMAPPED:
partition_votes[partition[right]] += weight
# Clean up the last assignment
max_arg = 0
max_val = 0
for arg in range(0, num_partitions):
if partition_sizes[arg] < partition_capacity:
            val = (partition_votes[arg]) * (
                1.0 - (<float>partition_sizes[arg]) / partition_capacity)
if val > max_val:
max_arg = arg
max_val = val
partition[left] = max_arg
return np.asarray(partition)
def run_restreaming_greedy(edges,
weights,
num_nodes,
num_cust_nodes,
num_partitions,
num_iterations,
is_customer_node):
print('\n{} PARTITIONS'.format(num_partitions))
assignments = None
print('ROUND\tWASTE\tSCORE')
waste_values = []
edge_score_values = []
for i in range(num_iterations):
assignments = linear_deterministic_greedy(edges,
weights,
num_nodes,
num_cust_nodes,
num_partitions,
assignments,
is_customer_node)
(waste, edge_score) = score(assignments, edges, weights)
waste_values.append(waste)
edge_score_values.append(edge_score)
print('{}\t{:0.3f}\t{:0.3f}'.format(i, waste, edge_score))
return assignments, waste_values, edge_score_values
We have to keep track of which nodes are customer nodes to ensure balance on the customer-side of the graph!
NUM_CUST_NODES = 16268
is_customer_node = np.zeros(NUM_NODES, dtype=np.int32)
is_customer_node[:NUM_CUST_NODES] = 1
is_customer_node = is_customer_node[the_shuffle]
Run the algorithm:
rval = run_restreaming_greedy(edges,
weights,
NUM_NODES,
NUM_CUST_NODES,
NUM_PARTITIONS,
num_iterations=10,
is_customer_node=is_customer_node)
assignments, waste_values, edge_score_values = rval
Unshuffle nodes:
idx = np.argsort(np.argsort(the_shuffle))
rval = assignments[idx]
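Since the_shuffle is a permutation, np.argsort(np.argsort(the_shuffle)) is the_shuffle itself (argsort of a permutation is its inverse, and the inverse of the inverse is the original), so this is equivalent to:
rval = assignments[the_shuffle]  # original node i was relabeled the_shuffle[i]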
Get a random clustering:
def get_random_partition(n_partitions,
n_nodes,
n_cust_nodes):
partition_capacity = int(n_cust_nodes / n_partitions)
cust_partition = np.concatenate([[i] * partition_capacity for
i in range(n_partitions)])
np.random.shuffle(cust_partition)
partition = np.zeros(n_nodes, dtype=np.int32)
partition[:len(cust_partition)] = cust_partition
partition[len(cust_partition):] = np.random.randint(
0, n_partitions, size=n_nodes - len(cust_partition))
return partition
random_partition = get_random_partition(NUM_PARTITIONS,
NUM_NODES, NUM_CUST_NODES)
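Finally, the bidder-side clusterings can be written out in the format the Auction class expects (the file names are placeholders matching the hypothetical driver above; bidders occupy the first NUM_CUST_NODES slots in the unshuffled node order):
np.save('clustering_rldg.npy', rval[:NUM_CUST_NODES].astype(np.int32))
np.save('clustering_random.npy', random_partition[:NUM_CUST_NODES])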