function [ cost, deltas, acc ] = grad_hid_disc2(hid1, hid2, lab1, lab2, pars)
% GRAD_HID_DISC2  Softmax cost, gradients and accuracy for two hidden blocks.
%
%   The rows of hid1 and hid2 are split into consecutive groups whose sizes
%   are given by pars.card. Each group is passed to grad_softmax together
%   with a target built from the matching entry of lab1 (for hid1) or lab2
%   (for hid2).
%
%   Inputs:
%     hid1, hid2 - activation matrices; columns are indexed by sample.
%     lab1, lab2 - one label per group; 1 is added to each before use
%                  (presumably 0-based labels -- confirm with the caller).
%     pars       - struct; the fields read here are card, batchsize, alpha.
%
%   Outputs:
%     cost   - sum of all per-group costs from both sides, times pars.alpha.
%     deltas - 2x1 cell; deltas{1} matches hid1 and deltas{2} matches hid2,
%              each holding the per-group gradients times pars.alpha.
%     acc    - mean of the accuracies returned by grad_softmax over all
%              groups and both sides.

% Shift labels by one so they can be handed to oneofc as class indices.
lab1 = lab1 + 1;
lab2 = lab2 + 1;

% Gradient accumulators shaped like the inputs (0*x keeps size of x).
deltas = {0*hid1; 0*hid2};

nGroups = length(pars.card);
cost = 0;
acc = 0;
rowStart = 1;
for g = 1:nGroups
    groupSize = pars.card(g);
    rows = rowStart:(rowStart + groupSize - 1);

    % First side: the same target column is repeated for every sample.
    % oneofc is defined elsewhere; presumably a one-of-C encoder -- confirm.
    target1 = repmat(oneofc(lab1(g), groupSize), 1, pars.batchsize);
    [ cost1, grad1, acc1 ] = grad_softmax(target1, hid1(rows, :), pars);

    % Second side, same row group.
    target2 = repmat(oneofc(lab2(g), groupSize), 1, pars.batchsize);
    [ cost2, grad2, acc2 ] = grad_softmax(target2, hid2(rows, :), pars);

    cost = cost + cost1;
    cost = cost + cost2;
    deltas{1}(rows, :) = deltas{1}(rows, :) + grad1;
    deltas{2}(rows, :) = deltas{2}(rows, :) + grad2;
    acc = acc + (acc1 + acc2);

    rowStart = rowStart + groupSize;
end

% Two accuracy values were added per group.
acc = acc / (nGroups * 2);

% Apply the weighting factor to every output that carries cost scale.
for side = 1:2
    deltas{side} = pars.alpha*deltas{side};
end
cost = pars.alpha * cost;

end

function [ cost, delta, acc ] = grad_softmax(L, P, pars)
% GRAD_SOFTMAX  Column-wise softmax cross-entropy: cost, gradient, accuracy.
%
%   Inputs:
%     L    - target matrix, same size as P. The accuracy computation pairs
%            the row index of every nonzero entry of L with one column of P,
%            so it only lines up when each column has exactly one nonzero.
%     P    - pre-softmax activations; the softmax is taken down each column.
%     pars - struct; only pars.batchsize is read, as the normaliser.
%
%   Outputs:
%     cost  - -(1/pars.batchsize) * sum of L .* log(softmax(P)).
%     delta - (1/pars.batchsize) * (softmax(P) - L), same size as P.
%     acc   - fraction of columns whose arg-max row equals the row of the
%             nonzero target entry.

% Subtract each column's maximum so exp() cannot overflow.
shifted = bsxfun(@minus, P, max(P, [], 1));
expShifted = exp(shifted);

% Sum explicitly along dimension 1. Without the dimension argument a
% single-row P would be summed across columns instead of per column.
colSum = sum(expShifted, 1);
pred = bsxfun(@rdivide, expShifted, colSum);

% Log-softmax computed directly. colSum >= 1 (the maximum contributes
% exp(0)), so this stays finite even where pred underflows to 0, which
% avoids 0 * log(0) = NaN poisoning the cost.
logpred = bsxfun(@minus, shifted, log(colSum));

[~, pred_quantized] = max(pred, [], 1);
[ yvec, ~ ] = find(L);
acc = mean(pred_quantized(:) == yvec(:));
cost = (-1/pars.batchsize) * sum(sum(L .* logpred));
delta = (1/pars.batchsize) * (pred - L);

end

