function [ cost, delta ] = grad_hid_disc4(hid, hid_target, L, pars)
% GRAD_HID_DISC4  Combined softmax / squared-error cost and gradient on hid.
%
%   [cost, delta] = grad_hid_disc4(hid, hid_target, L, pars)
%
%   The leading rows of hid are consumed in consecutive groups of
%   pars.card(k) rows; each group is scored by grad_softmax against a
%   target built with oneofc from the k-th label. Rows
%   pars.num_categorical+1 .. end are scored by squared error against the
%   same rows of hid_target.
%
%   Inputs
%     hid         activations, one column per sample
%     hid_target  targets for the squared-error rows (indexed like hid)
%     L           one label per group; 1 is added before it is passed to
%                 oneofc (presumably labels arrive 0-based -- TODO confirm)
%     pars        struct; fields read here: num_categorical, card,
%                 batchsize, alpha (softmax weight), beta (squared-error
%                 weight)
%
%   Outputs
%     cost   0.5*beta*sum(err.^2) + alpha * (sum of the group softmax costs)
%     delta  array the size of hid holding the gradient terms
%
%   NOTE(review): the softmax targets are tiled pars.batchsize times, so
%   hid is assumed to have exactly pars.batchsize columns -- confirm with
%   callers.

% Multiplying by zero gives a gradient buffer with the same size as hid.
delta = 0 * hid;

% --- squared-error part -------------------------------------------------
cont_rows = (pars.num_categorical + 1):size(hid, 1);
resid = hid(cont_rows, :) - hid_target(cont_rows, :);
resid_vec = resid(:);
cost = 0.5 * pars.beta * (resid_vec' * resid_vec);

% --- softmax part, one group of rows per entry of pars.card --------------
labels = L + 1;
num_groups = length(pars.card);
first_row = 1;
softmax_cost = 0;
for k = 1:num_groups
    group_size = pars.card(k);
    block_rows = first_row:(first_row + group_size - 1);

    % Same target column repeated for every sample in the batch.
    onehot = oneofc(labels(k), group_size);
    targets = repmat(onehot, 1, pars.batchsize);

    [ group_cost, group_delta ] = grad_softmax(targets, hid(block_rows, :), pars);

    softmax_cost = softmax_cost + pars.alpha * group_cost;
    delta(block_rows, :) = delta(block_rows, :) + pars.alpha * group_delta;

    first_row = first_row + group_size;
end

cost = cost + softmax_cost;

% Written last, so these rows hold exactly beta*err regardless of the loop.
delta(cont_rows, :) = pars.beta * resid;

end

function [ cost, delta, acc ] = grad_softmax(L, P, pars)
% GRAD_SOFTMAX  Softmax cross-entropy cost, its gradient, and accuracy.
%
%   [cost, delta, acc] = grad_softmax(L, P, pars)
%
%   Inputs
%     L     target indicator matrix, same size as P (one column per sample)
%     P     pre-softmax activations, classes along rows
%     pars  struct; only pars.batchsize is read (used as the normaliser)
%
%   Outputs
%     cost   -(1/pars.batchsize) * sum(sum(L .* log(softmax(P))))
%     delta  (1/pars.batchsize) * (softmax(P) - L)
%     acc    mean agreement between the arg-max row of softmax(P) and the
%            row indices of the nonzeros of L (assumes one nonzero per
%            column of L -- TODO confirm with callers)
%
%   The log-probabilities are computed directly from the shifted
%   activations rather than as log(softmax(P)). Taking the log of a
%   probability that underflowed to 0 gives -Inf, and 0 * -Inf for a
%   non-target class would turn the whole cost into NaN.

% Shift each column by its maximum so exp() cannot overflow.
shifted = bsxfun(@minus, P, max(P, [], 1));
expP = exp(shifted);

% Sum explicitly along dimension 1: without the dimension argument a
% single-row P would be collapsed to a scalar instead of per-column sums.
colsum = sum(expP, 1);

pred = bsxfun(@rdivide, expP, colsum);

% Log-softmax via log-sum-exp; always finite for finite P.
logpred = bsxfun(@minus, shifted, log(colsum));

[~, pred_quantized] = max(pred, [], 1);
[ yvec, ~ ] = find(L);
acc = mean(pred_quantized(:) == yvec(:));

cost = (-1/pars.batchsize) * sum(sum(L .* logpred));
delta = (1/pars.batchsize) * (pred - L);

end



