Monographs
Non-convex Optimization for Machine Learning
Prateek Jain and Purushottam Kar,
Foundations and Trends® in Machine Learning,
vol. 10,
December
2017.
[BibTeX]
[URL]
@article{JainK17,
author = {Prateek Jain and Purushottam Kar},
title = {Non-convex Optimization for Machine Learning},
journal = {Foundations and Trends® in Machine Learning},
year = {2017},
volume = {10},
url = {all_papers/JainK17_FTML.pdf}
}
Journal Publications
Partial Hard Thresholding
Prateek Jain, Ambuj Tewari and Inderjit S. Dhillon,
IEEE Trans. Information Theory,
vol. 63,
no. 5,
pp. 3029-3038,
2017.
[BibTeX]
[URL]
@article{JainTD17,
author = {Prateek Jain and Ambuj Tewari and Inderjit S. Dhillon},
title = {Partial Hard Thresholding},
journal = {IEEE Trans. Information Theory},
year = {2017},
volume = {63},
number = {5},
pages = {3029--3038},
url = {https://doi.org/10.1109/TIT.2017.2686880},
doi = {10.1109/TIT.2017.2686880}
}
Learning Sparsely Used Overcomplete Dictionaries via Alternating Minimization
Alekh Agarwal, Animashree Anandkumar, Prateek Jain and Praneeth Netrapalli,
SIAM Journal on Optimization,
vol. 26,
no. 4,
pp. 2775-2799,
2016.
[BibTeX]
[URL]
@article{AgarwalAJN16,
author = {Alekh Agarwal and Animashree Anandkumar and Prateek Jain and Praneeth Netrapalli},
title = {Learning Sparsely Used Overcomplete Dictionaries via Alternating Minimization},
journal = {SIAM Journal on Optimization},
year = {2016},
volume = {26},
number = {4},
pages = {2775--2799},
url = {https://doi.org/10.1137/140979861},
doi = {10.1137/140979861}
}
Phase Retrieval Using Alternating Minimization
Praneeth Netrapalli, Prateek Jain and Sujay Sanghavi,
IEEE Transactions on Signal Processing (ITSP),
vol. 63,
no. 18,
pp. 4814-4826,
2015.
[BibTeX]
[URL]
@article{NetrapalliJS15,
author = {Praneeth Netrapalli and Prateek Jain and Sujay Sanghavi},
title = {Phase Retrieval Using Alternating Minimization},
journal = {IEEE Transactions on Signal Processing (ITSP)},
year = {2015},
volume = {63},
number = {18},
pages = {4814--4826},
url = {http://dx.doi.org/10.1109/TSP.2015.2448516},
doi = {10.1109/TSP.2015.2448516}
}
Hashing Hyperplane Queries to Near Points with Applications to Large-Scale Active Learning
Sudheendra Vijayanarasimhan, Prateek Jain and Kristen Grauman,
IEEE Trans. Pattern Anal. Mach. Intell.,
vol. 36,
no. 2,
pp. 276-288,
2014.
[BibTeX]
[URL]
@article{Vijayanarasimhan0G14,
author = {Sudheendra Vijayanarasimhan and Prateek Jain and Kristen Grauman},
title = {Hashing Hyperplane Queries to Near Points with Applications to Large-Scale Active Learning},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
year = {2014},
volume = {36},
number = {2},
pages = {276--288},
url = {http://doi.ieeecomputersociety.org/10.1109/TPAMI.2013.121},
doi = {10.1109/TPAMI.2013.121}
}
Improved Multiple Sequence Alignments Using Coupled Pattern Mining
K. S. M. Tozammel Hossain, Debprakash Patnaik, Srivatsan Laxman, Prateek Jain, Chris Bailey-Kellogg and Naren Ramakrishnan,
IEEE/ACM Trans. Comput. Biology Bioinform.,
vol. 10,
no. 5,
pp. 1098-1112,
2013.
[BibTeX]
[URL]
@article{HossainPL0BR13,
author = {K. S. M. Tozammel Hossain and Debprakash Patnaik and Srivatsan Laxman and Prateek Jain and Chris Bailey-Kellogg and Naren Ramakrishnan},
title = {Improved Multiple Sequence Alignments Using Coupled Pattern Mining},
journal = {IEEE/ACM Trans. Comput. Biology Bioinform.},
year = {2013},
volume = {10},
number = {5},
pages = {1098--1112},
url = {http://dx.doi.org/10.1109/TCBB.2013.36},
doi = {10.1109/TCBB.2013.36}
}
Metric and Kernel Learning Using a Linear Transformation
Prateek Jain, Brian Kulis, Jason V. Davis and Inderjit S. Dhillon,
Journal of Machine Learning Research,
vol. 13,
pp. 519-547,
2012.
[BibTeX]
[URL]
@article{JainKDD12,
author = {Prateek Jain and Brian Kulis and Jason V. Davis and Inderjit S. Dhillon},
title = {Metric and Kernel Learning Using a Linear Transformation},
journal = {Journal of Machine Learning Research},
year = {2012},
volume = {13},
pages = {519--547},
url = {http://dl.acm.org/citation.cfm?id=2188402}
}
Fast Similarity Search for Learned Metrics
Brian Kulis, Prateek Jain and Kristen Grauman,
IEEE Trans. Pattern Anal. Mach. Intell.,
vol. 31,
no. 12,
pp. 2143-2157,
2009.
[BibTeX]
[URL]
@article{KulisJG09,
author = {Brian Kulis and Prateek Jain and Kristen Grauman},
title = {Fast Similarity Search for Learned Metrics},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
year = {2009},
volume = {31},
number = {12},
pages = {2143--2157},
url = {http://doi.ieeecomputersociety.org/10.1109/TPAMI.2009.151},
doi = {10.1109/TPAMI.2009.151}
}
Simultaneous Unsupervised Learning of Disparate Clusterings
Prateek Jain, Raghu Meka and Inderjit S. Dhillon,
Statistical Analysis and Data Mining,
vol. 1,
no. 3,
pp. 195-210,
2008.
[BibTeX]
[URL]
@article{JainMD08b,
author = {Prateek Jain and Raghu Meka and Inderjit S. Dhillon},
title = {Simultaneous Unsupervised Learning of Disparate Clusterings},
journal = {Statistical Analysis and Data Mining},
year = {2008},
volume = {1},
number = {3},
pages = {195--210},
url = {http://dx.doi.org/10.1002/sam.10007},
doi = {10.1002/sam.10007}
}
Conference Publications
(Nearly) Optimal Private Linear Regression for Sub-Gaussian Data via Adaptive Clipping
Prateek Varshney, Abhradeep Thakurta and Prateek Jain,
in Conference on Learning Theory, 2-5 July 2022, London, UK,
2022.
[BibTeX]
[Abstract]
[URL]
We study the problem of differentially private linear regression where each data point is sampled from a fixed sub-Gaussian style distribution. We propose and analyze a one-pass mini-batch stochastic gradient descent method (DP-AMBSSGD) where points in each iteration are sampled without replacement. Noise is added for DP, but the noise standard deviation is estimated online. Compared to existing (ε,δ)-DP techniques, which have sub-optimal error bounds, DP-AMBSSGD is able to provide nearly optimal error bounds in terms of key parameters like the dimensionality d, the number of points N, and the standard deviation σ of the noise in the observations. For example, when the d-dimensional covariates are sampled i.i.d. from the normal distribution, the excess error of DP-AMBSSGD due to privacy is σ²d/N·(1 + d/(ε²N)), i.e., the error is meaningful when the number of samples N ≥ d log d, which is the standard operative regime for linear regression. In contrast, error bounds for existing efficient methods in this setting are d³/(ε²N²), even for σ = 0. That is, for constant ε, the existing techniques require N = d^{1.5} to provide a non-trivial result.
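To make the algorithmic pattern concrete: DP-AMBSSGD is, at its core, a one-pass mini-batch SGD in which per-sample gradients are clipped and Gaussian noise is added to each update. The following is a minimal NumPy sketch of that generic DP-SGD pattern for linear regression, assuming a fixed clipping threshold and noise multiplier; the paper's online estimation of the noise standard deviation and its privacy accounting are not shown.

    import numpy as np

    def dp_sgd_linear_regression(X, y, lr=0.1, batch=64, clip=1.0, noise_mult=1.0, seed=0):
        # Generic DP-SGD sketch: clip per-sample gradients to norm `clip`, add
        # Gaussian noise scaled by `noise_mult * clip`, and take one pass over the data.
        rng = np.random.default_rng(seed)
        n, d = X.shape
        w = np.zeros(d)
        for start in range(0, n, batch):
            Xb, yb = X[start:start + batch], y[start:start + batch]
            grads = (Xb @ w - yb)[:, None] * Xb                 # per-sample gradients
            norms = np.linalg.norm(grads, axis=1, keepdims=True)
            grads = grads / np.maximum(1.0, norms / clip)       # clip each gradient
            noise = rng.normal(0.0, noise_mult * clip, size=d)
            w -= lr * (grads.sum(axis=0) + noise) / len(Xb)
        return w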
@inproceedings{VarshneyTJ2022,
author = {Prateek Varshney and Abhradeep Thakurta and Prateek Jain},
title = {(Nearly) Optimal Private Linear Regression for Sub-Gaussian Data via Adaptive Clipping},
booktitle = {Conference on Learning Theory, 2-5 July 2022, London, UK},
publisher = {PMLR},
year = {2022},
volume = {178},
pages = {1126--1166},
url = {https://proceedings.mlr.press/v178/varshney22a.html}
}
Robust Training in High Dimensions via Block Coordinate Geometric Median Descent
Anish Acharya, Abolfazl Hashemi, Prateek Jain, Sujay Sanghavi, Inderjit S. Dhillon and Ufuk Topcu,
in International Conference on Artificial Intelligence and Statistics, AISTATS 2022, 28-30 March 2022, Virtual Event,
2022.
[BibTeX]
[Abstract]
[URL]
Geometric median (GM) is a classical method in statistics for achieving robust estimation of the uncorrupted data; under gross corruption, it achieves the optimal breakdown point of 1/2. However, its computational complexity makes it infeasible for robustifying stochastic gradient descent (SGD) in high-dimensional optimization problems. In this paper, we show that by applying GM to only a judiciously chosen block of coordinates at a time and using a memory mechanism, one can retain the breakdown point of 1/2 for smooth non-convex problems, with non-asymptotic convergence rates comparable to the SGD with GM while resulting in significant speedup in training. We further validate the run-time and robustness of our approach empirically on several popular deep learning tasks. Code available at: https://github.com/anishacharya/BGMD
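The robust aggregation primitive used above is the geometric median, typically approximated with Weiszfeld-style iterations. Below is a minimal NumPy sketch of that primitive applied to one (illustrative) block of gradient coordinates; the paper's block-selection rule and memory mechanism are not reproduced here.

    import numpy as np

    def geometric_median(points, iters=100, eps=1e-8):
        # Weiszfeld iterations: returns a robust aggregate of the rows of `points`.
        z = points.mean(axis=0)
        for _ in range(iters):
            dist = np.linalg.norm(points - z, axis=1)
            wts = 1.0 / np.maximum(dist, eps)
            z_new = (wts[:, None] * points).sum(axis=0) / wts.sum()
            if np.linalg.norm(z_new - z) < eps:
                break
            z = z_new
        return z

    # Aggregate per-worker gradients over one coordinate block (block choice is illustrative).
    grads = np.random.randn(32, 1000)    # 32 workers, 1000 parameters, possibly corrupted
    block = np.arange(100)               # coordinates updated in this step
    robust_update = geometric_median(grads[:, block])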
@inproceedings{AcharyaHJSDT2022,
author = {Anish Acharya and Abolfazl Hashemi and Prateek Jain and Sujay Sanghavi and Inderjit S. Dhillon and Ufuk Topcu},
title = {Robust Training in High Dimensions via Block Coordinate Geometric Median Descent},
booktitle = {International Conference on Artificial Intelligence and Statistics, AISTATS 2022, 28-30 March 2022, Virtual Event},
publisher = {PMLR},
year = {2022},
volume = {151},
pages = {11145--11168},
url = {https://proceedings.mlr.press/v151/acharya22a.html}
}
Differentially Private Model Personalization
Prateek Jain, John Rush, Adam D. Smith, Shuang Song and Abhradeep Guha Thakurta,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[URL]
Near-optimal Offline and Streaming Algorithms for Learning Non-Linear Dynamical Systems
Suhas S. Kowshik, Dheeraj Nagaraj, Prateek Jain and Praneeth Netrapalli,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[Abstract]
[URL]
We consider the setting of vector-valued non-linear dynamical systems. While the problem is well-studied in the linear case, where the link function is the identity, with optimal error rates even for non-mixing systems, existing results in the non-linear case hold only for mixing systems. In this work, we improve existing results for learning nonlinear systems in a number of ways: a) we provide the first offline algorithm that can learn non-linear dynamical systems without the mixing assumption, b) we significantly improve upon the sample complexity of existing results for mixing systems, c) in the much harder one-pass, streaming setting we study an SGD with Reverse Experience Replay (SGD-RER) method, and demonstrate that for mixing systems, it achieves the same sample complexity as our offline algorithm, d) we justify the expansivity assumption by showing that for the popular ReLU link function --- a non-expansive but easy to learn link function with i.i.d. samples --- any method would require exponentially many samples (with respect to the dimension of the input) from the dynamical system. We validate our results via simulations and demonstrate that a naive application of SGD can be highly sub-optimal. Indeed, our work demonstrates that for correlated data, specialized methods designed for the dependency structure in data can significantly outperform standard SGD based methods.
@inproceedings{KowshikNJN2021,
author = {Suhas S. Kowshik and Dheeraj Nagaraj and Prateek Jain and Praneeth Netrapalli},
title = {Near-optimal Offline and Streaming Algorithms for Learning Non-Linear Dynamical Systems},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
year = {2021},
pages = {8518--8531},
url = {https://proceedings.neurips.cc/paper/2021/hash/47a658229eb2368a99f1d032c8848542-Abstract.html}
}
LLC: Accurate, Multi-purpose Learnt Low-dimensional Binary Codes
Aditya Kusupati, Matthew Wallingford, Vivek Ramanujan, Raghav Somani, Jae Sung Park, Krishna Pillutla, Prateek Jain, Sham M. Kakade and Ali Farhadi,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[Abstract]
[URL]
Learning binary representations of instances and classes is a classical problem with several high potential applications. In modern settings, the compression of high-dimensional neural representations to low-dimensional binary codes is a challenging task and often requires large bit-codes to be accurate. In this work, we propose a novel method for Learning Low-dimensional binary Codes (LLC) for instances as well as classes. Our method does not require any side-information, like annotated attributes or label meta-data, and learns extremely low-dimensional binary codes (≈20 bits for ImageNet-1K). The learnt codes are super-efficient while still ensuring nearly optimal classification accuracy for ResNet50 on ImageNet-1K. We demonstrate that the learnt codes capture intrinsically important features in the data by discovering an intuitive taxonomy over classes. We further quantitatively measure the quality of our codes by applying them to the efficient image retrieval as well as out-of-distribution (OOD) detection problems. For the ImageNet-100 retrieval problem, our learnt binary codes outperform 16-bit HashNet using only 10 bits and also are as accurate as 10-dimensional real representations. Finally, our learnt binary codes can perform OOD detection, out-of-the-box, as accurately as a baseline that needs ≈3000 samples to tune its threshold, while we require none. Code is open-sourced at https://github.com/RAIVNLab/LLC.
@inproceedings{KusupatiWRSPPJK2021,
author = {Aditya Kusupati and Matthew Wallingford and Vivek Ramanujan and Raghav Somani and Jae Sung Park and Krishna Pillutla and Prateek Jain and Sham M. Kakade and Ali Farhadi},
title = {LLC: Accurate, Multi-purpose Learnt Low-dimensional Binary Codes},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
year = {2021},
pages = {23900--23913},
url = {https://proceedings.neurips.cc/paper/2021/hash/c88d8d0a6097754525e02c2246d8d27f-Abstract.html}
}
Do Input Gradients Highlight Discriminative Features?
Harshay Shah, Prateek Jain and Praneeth Netrapalli,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[Abstract]
[URL]
Post-hoc gradient-based interpretability methods [Simonyan et al., 2013, Smilkov et al., 2017] that provide instance-specific explanations of model predictions are often based on assumption (A): magnitude of input gradients—gradients of logits with respect to input—noisily highlight discriminative task-relevant features. In this work, we test the validity of assumption (A) using a three-pronged approach: 1. We develop an evaluation framework, DiffROAR, to test assumption (A) on four image classification benchmarks. Our results suggest that (i) input gradients of standard models (i.e., trained on original data) may grossly violate (A), whereas (ii) input gradients of adversarially robust models satisfy (A). 2. We then introduce BlockMNIST, an MNIST-based semi-real dataset, that by design encodes a priori knowledge of discriminative features. Our analysis on BlockMNIST leverages this information to validate as well as characterize differences between input gradient attributions of standard and robust models. 3. Finally, we theoretically prove that our empirical findings hold on a simplified version of the BlockMNIST dataset. Specifically, we prove that input gradients of standard one-hidden-layer MLPs trained on this dataset do not highlight instance-specific signal coordinates, thus grossly violating assumption (A). Our findings motivate the need to formalize and test common assumptions in interpretability in a falsifiable manner [Leavitt and Morcos, 2020]. We believe that the DiffROAR evaluation framework and BlockMNIST-based datasets can serve as sanity checks to audit instance-specific interpretability methods; code and data available at https://github.com/harshays/inputgradients.
@inproceedings{ShahJN2021,
author = {Harshay Shah and Prateek Jain and Praneeth Netrapalli},
title = {Do Input Gradients Highlight Discriminative Features?},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
year = {2021},
pages = {2046--2059},
url = {https://proceedings.neurips.cc/paper/2021/hash/0fe6a94848e5c68a54010b61b3e94b0e-Abstract.html}
}
Statistically and Computationally Efficient Linear Meta-representation Learning
Kiran Koshy Thekumparampil, Prateek Jain, Praneeth Netrapalli and Sewoong Oh,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[Abstract]
[URL]
In typical few-shot learning, each task is not equipped with enough data to be learned in isolation. To cope with such data scarcity, meta-representation learning methods train across many related tasks to find a shared (lower-dimensional) representation of the data where all tasks can be solved accurately. It is hypothesized that any new arriving tasks can be rapidly trained on this low-dimensional representation using only a few samples. Despite the practical successes of this approach, its statistical and computational properties are less understood. Moreover, the prescribed algorithms in these studies have little resemblance to those used in practice or they are computationally intractable. To understand and explain the success of popular meta-representation learning approaches such as ANIL, MetaOptNet, R2D2, and OML, we study an alternating gradient-descent minimization (AltMinGD) method (and its variant alternating minimization (AltMin)) which underlies the aforementioned methods. For a simple but canonical setting of shared linear representations, we show that AltMinGD achieves nearly-optimal estimation error, requiring only poly(log d) samples per task. This agrees with the observed efficacy of this algorithm in practical few-shot learning scenarios.
@inproceedings{ThekumparampilJ2021,
author = {Kiran Koshy Thekumparampil and Prateek Jain and Praneeth Netrapalli and Sewoong Oh},
title = {Statistically and Computationally Efficient Linear Meta-representation Learning},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
year = {2021},
pages = {18487--18500},
url = {https://proceedings.neurips.cc/paper/2021/hash/99e7e6ce097324aceb45f98299ceb621-Abstract.html}
}
Private Alternating Least Squares: Practical Private Matrix Completion with Tighter Rates
Steve Chien, Prateek Jain, Walid Krichene, Steffen Rendle, Shuang Song, Abhradeep Thakurta and Li Zhang,
in Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event,
2021.
[BibTeX]
[Abstract]
[URL]
We study the problem of differentially private (DP) matrix completion under user-level privacy. We design a joint differentially private variant of the popular Alternating-Least-Squares (ALS) method that achieves: i) (nearly) optimal sample complexity for matrix completion (in terms of number of items, users), and ii) the best known privacy/utility trade-off both theoretically, as well as on benchmark data sets. In particular, we provide the first global convergence analysis of ALS with noise introduced to ensure DP, and show that, in comparison to the best known alternative (the Private Frank-Wolfe algorithm by Jain et al. (2018)), our error bounds scale significantly better with respect to the number of items and users, which is critical in practical problems. Extensive validation on standard benchmarks demonstrate that the algorithm, in combination with carefully designed sampling procedures, is significantly more accurate than existing techniques, thus promising to be the first practical DP embedding model.
@inproceedings{ChienJKR0T02021,
author = {Steve Chien and Prateek Jain and Walid Krichene and Steffen Rendle and Shuang Song and Abhradeep Thakurta and Li Zhang},
title = {Private Alternating Least Squares: Practical Private Matrix Completion with Tighter Rates},
booktitle = {Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event},
publisher = {PMLR},
year = {2021},
volume = {139},
pages = {1877--1887},
url = {http://proceedings.mlr.press/v139/chien21a.html}
}
Optimal regret algorithm for Pseudo-1d Bandit Convex Optimization
Aadirupa Saha, Nagarajan Natarajan, Praneeth Netrapalli and Prateek Jain,
in Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event,
2021.
[BibTeX]
[Abstract]
[URL]
We study online learning with bandit feedback (i.e., the learner has access to only a zeroth-order oracle) where the cost/reward functions f_t admit a "pseudo-1d" structure, i.e. f_t(w) = loss_t(pred_t(w)), where the output of pred_t is one-dimensional. At each round, the learner observes context x_t, plays prediction pred_t(w_t; x_t) (e.g. pred_t(⋅) = ⟨x_t, ⋅⟩) for some w_t and observes loss loss_t(pred_t(w_t)), where loss_t is a convex Lipschitz-continuous function. The goal is to minimize the standard regret metric. This pseudo-1d bandit convex optimization problem (SBCO) arises frequently in domains such as online decision-making or parameter-tuning in large systems. For this problem, we first show a regret lower bound of min((dT)^{1/2}, T^{3/4}) for any algorithm, where T is the number of rounds. We propose a new algorithm that combines randomized online gradient descent with a kernelized exponential weights method to exploit the pseudo-1d structure effectively, guaranteeing the optimal regret bound mentioned above, up to additional logarithmic factors. In contrast, applying state-of-the-art online convex optimization methods leads to O(min(d^{9.5} T^{1/2}, d^{1/2} T^{3/4})) regret, which is significantly suboptimal in terms of d.
@inproceedings{SahaNNJ2021,
author = {Aadirupa Saha and Nagarajan Natarajan and Praneeth Netrapalli and Prateek Jain},
title = {Optimal regret algorithm for Pseudo-1d Bandit Convex Optimization},
booktitle = {Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event},
publisher = {PMLR},
year = {2021},
volume = {139},
pages = {9255--9264},
url = {http://proceedings.mlr.press/v139/saha21c.html}
}
Streaming Linear System Identification with Reverse Experience Replay
Prateek Jain, Suhas S. Kowshik, Dheeraj Nagaraj and Praneeth Netrapalli,
in Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual,
2021.
[BibTeX]
[Abstract]
[URL]
We consider the problem of estimating a linear time-invariant (LTI) dynamical system from a single trajectory via streaming algorithms, which is encountered in several applications including reinforcement learning (RL) and time-series analysis. While the LTI system estimation problem is well-studied in the offline setting, the practically important streaming/online setting has received little attention. Standard streaming methods like stochastic gradient descent (SGD) are unlikely to work since streaming points can be highly correlated. In this work, we propose a novel streaming algorithm, SGD with Reverse Experience Replay (SGD-RER), that is inspired by the experience replay (ER) technique popular in the RL literature. SGD-RER divides data into small buffers and runs SGD backwards on the data stored in the individual buffers. We show that this algorithm exactly deconstructs the dependency structure and obtains information theoretically optimal guarantees for both parameter error and prediction error. Thus, we provide the first -- to the best of our knowledge -- optimal SGD-style algorithm for the classical problem of linear system identification with a first order oracle. Furthermore, SGD-RER can be applied to more general settings like sparse LTI identification with known sparsity pattern, and non-linear dynamical systems. Our work demonstrates that knowledge of the data dependency structure can aid us in designing statistically and computationally efficient algorithms which can "decorrelate" streaming samples.
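The buffering-and-reverse-replay idea can be sketched in a few lines. Below is a simplified NumPy version for a scalar-output least-squares model, assuming the stream yields (x_t, y_t) pairs; the gap left between buffers and the averaging used in the paper's analysis are omitted.

    import numpy as np

    def sgd_rer(stream, dim, buffer_size=50, lr=0.01):
        # SGD with Reverse Experience Replay: fill a small buffer from the stream,
        # then run SGD over the buffered points in reverse order before discarding them.
        w = np.zeros(dim)
        buf = []
        for x, y in stream:
            buf.append((x, y))
            if len(buf) == buffer_size:
                for xb, yb in reversed(buf):
                    w -= lr * (xb @ w - yb) * xb
                buf = []
        return w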
@inproceedings{JainKNN2021,
author = {Prateek Jain and Suhas S. Kowshik and Dheeraj Nagaraj and Praneeth Netrapalli},
title = {Streaming Linear System Identification with Reverse Experience Replay},
booktitle = {Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual},
year = {2021},
pages = {30140--30152},
url = {https://proceedings.neurips.cc/paper/2021/hash/fd2c5e4680d9a01dba3aada5ece22270-Abstract.html}
}
Projection Efficient Subgradient Method and Optimal Nonsmooth Frank-Wolfe Method
Kiran Koshy Thekumparampil, Prateek Jain, Praneeth Netrapalli and Sewoong Oh,
in Proceedings of the 34th Conference on Advances in Neural Information Processing Systems (NeurIPS),
2020.
[BibTeX]
[Abstract]
[URL]
We consider the classical setting of optimizing a nonsmooth Lipschitz continuous convex function over a convex constraint set, when having access to a (stochastic) first-order oracle (FO) for the function and a projection oracle (PO) for the constraint set. It is well known that to achieve ε-suboptimality in high dimensions, Θ(ε^{-2}) FO calls are necessary [64]. This is achieved by the projected subgradient method (PGD) [11]. However, PGD also entails O(ε^{-2}) PO calls, which may be computationally costlier than FO calls (e.g. nuclear norm constraints). Improving this PO call complexity of PGD is largely unexplored, despite the fundamental nature of this problem and extensive literature. We present the first such improvement. This only requires a mild assumption that the objective function, when extended to a slightly larger neighborhood of the constraint set, still remains Lipschitz and accessible via FO. In particular, we introduce the MOPES method, which carefully combines Moreau-Yosida smoothing and accelerated first-order schemes. This is guaranteed to find a feasible ε-suboptimal solution using only O(ε^{-1}) PO calls and optimal O(ε^{-2}) FO calls. Further, instead of a PO, if we only have a linear minimization oracle (LMO, à la Frank-Wolfe) to access the constraint set, an extension of our method, MOLES, finds a feasible ε-suboptimal solution using O(ε^{-2}) LMO calls and FO calls—both match known lower bounds [54], resolving a question left open since [84]. Our experiments confirm that these methods achieve significant speedups over the state-of-the-art, for a problem with costly PO and LMO calls.
@inproceedings{ThekumparampilJNO2020,
author = {Kiran Koshy Thekumparampil and Prateek Jain and Praneeth Netrapalli and Sewoong Oh},
title = {Projection Efficient Subgradient Method and Optimal Nonsmooth Frank-Wolfe Method},
booktitle = {Proceedings of the 34th Conference on Advances in Neural Information Processing Systems (NeurIPS)},
year = {2020},
url = {https://proceedings.neurips.cc/paper/2020/file/8f468c873a32bb0619eaeb2050ba45d1-Paper.pdf}
}
Least Squares Regression with Markovian Data: Fundamental Limits and Algorithms
Dheeraj Nagaraj, Xian Wu, Guy Bresler, Prateek Jain and Praneeth Netrapalli,
in Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual,
2020.
[BibTeX]
[Abstract]
[URL]
We study the problem of least squares linear regression where the datapoints are dependent and are sampled from a Markov chain. We establish sharp information theoretic minimax lower bounds for this problem in terms of t_mix, the mixing time of the underlying Markov chain, under different noise settings. Our results establish that in general, optimization with Markovian data is strictly harder than optimization with independent data and a trivial algorithm (SGD-DD) that works with only one in every t_mix samples, which are approximately independent, is minimax optimal. In fact, it is strictly better than the popular Stochastic Gradient Descent (SGD) method with constant step-size which is otherwise minimax optimal in the regression with independent data setting. Beyond a worst case analysis, we investigate whether structured datasets seen in practice such as Gaussian auto-regressive dynamics can admit more efficient optimization schemes. Surprisingly, even in this specific and natural setting, Stochastic Gradient Descent (SGD) with constant step-size is still no better than SGD-DD. Instead, we propose an algorithm based on experience replay--a popular reinforcement learning technique--that achieves a significantly better error rate. Our improved rate serves as one of the first results where an algorithm outperforms SGD-DD on an interesting Markov chain and also provides one of the first theoretical analyses to support the use of experience replay in practice.
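The SGD-DD baseline discussed above is simple to state: retain only every t_mix-th sample so that the retained points are approximately independent, and run plain SGD on them. A minimal NumPy sketch for least squares, assuming t_mix is known:

    import numpy as np

    def sgd_data_drop(xs, ys, t_mix, lr=0.01):
        # SGD-DD: subsample one point in every t_mix so samples are nearly independent,
        # then run ordinary streaming SGD for least squares on the retained points.
        w = np.zeros(xs.shape[1])
        for x, y in zip(xs[::t_mix], ys[::t_mix]):
            w -= lr * (x @ w - y) * x
        return w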
@inproceedings{NagarajWBJN2020,
author = {Dheeraj Nagaraj and Xian Wu and Guy Bresler and Prateek Jain and Praneeth Netrapalli},
title = {Least Squares Regression with Markovian Data: Fundamental Limits and Algorithms},
booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual},
year = {2020},
url = {https://proceedings.neurips.cc/paper/2020/hash/c22abfa379f38b5b0411bc11fa9bf92f-Abstract.html}
}
RNNPool: Efficient Non-linear Pooling for RAM Constrained Inference
Oindrila Saha, Aditya Kusupati, Harsha Vardhan Simhadri, Manik Varma and Prateek Jain,
in Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual,
2020.
[BibTeX]
[Abstract]
[URL]
Standard Convolutional Neural Networks (CNNs) designed for computer vision tasks tend to have large intermediate activation maps. These require large working memory and are thus unsuitable for deployment on resource-constrained devices typically used for inference on the edge. Aggressively downsampling the images via pooling or strided convolutions can address the problem but leads to a significant decrease in accuracy due to gross aggregation of the feature map by standard pooling operators. In this paper, we introduce RNNPool, a novel pooling operator based on Recurrent Neural Networks (RNNs), that efficiently aggregates features over large patches of an image and rapidly downsamples activation maps. Empirical evaluation indicates that an RNNPool layer can effectively replace multiple blocks in a variety of architectures such as MobileNets, DenseNet when applied to standard vision tasks like image classification and face detection. That is, RNNPool can significantly decrease computational complexity and peak memory usage for inference while retaining comparable accuracy. We use RNNPool with the standard S3FD architecture to construct a face detection method that achieves state-of-the-art MAP for tiny ARM Cortex-M4 class microcontrollers with under 256 KB of RAM. Code is released at https://github.com/Microsoft/EdgeML.
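To illustrate the pooling idea only: an RNNPool-style operator summarizes a patch of an activation map into a single small vector using recurrent sweeps. The sketch below is a drastically simplified, single-direction NumPy version (one RNN over the rows of a patch, a second RNN over the row summaries); the actual operator in the paper uses bidirectional sweeps and a different parameterization.

    import numpy as np

    def rnn_final_state(xs, Wx, Wh):
        # Plain tanh RNN over a sequence of vectors; returns the final hidden state.
        h = np.zeros(Wh.shape[0])
        for x in xs:
            h = np.tanh(Wx @ x + Wh @ h)
        return h

    def rnnpool_patch(patch, Wx1, Wh1, Wx2, Wh2):
        # Summarize an (rows, cols, channels) patch into one vector:
        # sweep each row with the first RNN, then sweep the row summaries with the second.
        row_states = [rnn_final_state(row, Wx1, Wh1) for row in patch]
        return rnn_final_state(row_states, Wx2, Wh2)

    rng = np.random.default_rng(0)
    patch = rng.standard_normal((8, 8, 4))   # 8x8 patch with 4 channels
    h = 16
    pooled = rnnpool_patch(
        patch,
        Wx1=0.1 * rng.standard_normal((h, 4)), Wh1=0.1 * rng.standard_normal((h, h)),
        Wx2=0.1 * rng.standard_normal((h, h)), Wh2=0.1 * rng.standard_normal((h, h)))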
@inproceedings{SahaKSVJ2020,
author = {Oindrila Saha and Aditya Kusupati and Harsha Vardhan Simhadri and Manik Varma and Prateek Jain},
title = {RNNPool: Efficient Non-linear Pooling for RAM Constrained Inference},
booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual},
year = {2020},
url = {https://proceedings.neurips.cc/paper/2020/hash/ebd9629fc3ae5e9f6611e2ee05a31cef-Abstract.html}
}
The Pitfalls of Simplicity Bias in Neural Networks
Harshay Shah, Kaustav Tamuly, Aditi Raghunathan, Prateek Jain and Praneeth Netrapalli,
in Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual,
2020.
[BibTeX]
[Abstract]
[URL]
Several works have proposed Simplicity Bias (SB)---the tendency of standard training procedures such as Stochastic Gradient Descent (SGD) to find simple models---to justify why neural networks generalize well [Arpit et al. 2017, Nakkiran et al. 2019, Valle-Perez et al. 2019]. However, the precise notion of simplicity remains vague. Furthermore, previous settings [Soudry et al. 2018, Gunasekar et al. 2018] that use SB to theoretically justify why neural networks generalize well do not simultaneously capture the non-robustness of neural networks---a widely observed phenomenon in practice [Goodfellow et al. 2014, Jo and Bengio 2017]. We attempt to reconcile SB and the superior standard generalization of neural networks with the non-robustness observed in practice by introducing piecewise-linear and image-based datasets, which (a) incorporate a precise notion of simplicity, (b) comprise multiple predictive features with varying levels of simplicity, and (c) capture the non-robustness of neural networks trained on real data. Using theory and empirics on these datasets, we make four observations: (i) SB of SGD and variants can be extreme: neural networks can exclusively rely on the simplest feature and remain invariant to all predictive complex features. (ii) The extreme aspect of SB could explain why seemingly benign distribution shifts and small adversarial perturbations significantly degrade model performance. (iii) Contrary to conventional wisdom, SB can also hurt generalization on the same data distribution, as SB persists even when the simplest feature has less predictive power than the more complex features. (iv) Common approaches to improve generalization and robustness---ensembles and adversarial training---can fail in mitigating SB and its pitfalls. Given the role of SB in training neural networks, we hope that the proposed datasets and methods serve as an effective testbed to evaluate novel algorithmic approaches aimed at avoiding the pitfalls of SB.
@inproceedings{ShahTRJN2020,
author = {Harshay Shah and Kaustav Tamuly and Aditi Raghunathan and Prateek Jain and Praneeth Netrapalli},
title = {The Pitfalls of Simplicity Bias in Neural Networks},
booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual},
year = {2020},
url = {https://proceedings.neurips.cc/paper/2020/hash/6cfe0e6127fa25df2a0ef2ae1067d915-Abstract.html}
}
DROCC: Deep Robust One-Class Classification
Sachin Goyal, Aditi Raghunathan, Moksh Jain, Harsha Vardhan Simhadri and Prateek Jain,
in Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event,
2020.
[BibTeX]
[Abstract]
[URL]
Classical approaches for one-class problems such as one-class SVM and isolation forest require careful feature engineering when applied to structured domains like images. State-of-the-art methods aim to leverage deep learning to learn appropriate features via two main approaches. The first approach based on predicting transformations (Golan & El-Yaniv, 2018; Hendrycks et al., 2019a) while successful in some domains, crucially depends on an appropriate domain-specific set of transformations that are hard to obtain in general. The second approach of minimizing a classical one-class loss on the learned final layer representations, e.g., DeepSVDD (Ruff et al., 2018) suffers from the fundamental drawback of representation collapse. In this work, we propose Deep Robust One Class Classification (DROCC) that is both applicable to most standard domains without requiring any side-information and robust to representation collapse. DROCC is based on the assumption that the points from the class of interest lie on a well-sampled, locally linear low dimensional manifold. Empirical evaluation demonstrates that DROCC is highly effective in two different one-class problem settings and on a range of real-world datasets across different domains: tabular data, images (CIFAR and ImageNet), audio, and time-series, offering up to 20% increase in accuracy over the state-of-the-art in anomaly detection. Code is available at https://github.com/microsoft/EdgeML.
@inproceedings{GoyalRJS020,
author = {Sachin Goyal and Aditi Raghunathan and Moksh Jain and Harsha Vardhan Simhadri and Prateek Jain},
title = {DROCC: Deep Robust One-Class Classification},
booktitle = {Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event},
publisher = {PMLR},
year = {2020},
volume = {119},
pages = {3711--3721},
url = {http://proceedings.mlr.press/v119/goyal20c.html}
}
Optimization and Analysis of the pAp@k Metric for Recommender Systems
Gaurush Hiranandani, Warut Vijitbenjaronk, Sanmi Koyejo and Prateek Jain,
in Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event,
2020.
[BibTeX]
[Abstract]
[URL]
Modern recommendation and notification systems must be robust to data imbalance, limitations on the number of recommendations/notifications, and heterogeneous engagement profiles across users. The pAp@k metric, which combines the partial-AUC and the precision@k metrics, was recently proposed to evaluate such recommendation systems and has been used in real-world deployments. Conceptually, pAp@k measures the probability of correctly ranking a top-ranked positive instance over top-ranked negative instances. Due to the combinatorial aspect surfaced by top-ranked points, little is known about the characteristics and optimization methods of pAp@k. In this paper, we analyze the learning-theoretic properties of pAp@k, particularly its benefits in evaluating modern recommender systems, and propose novel surrogates that are consistent under certain data regularity conditions. We then provide gradient descent based algorithms to optimize the surrogates directly. Our analysis and experimental evaluation suggest that pAp@k indeed exhibits a certain dual behavior with respect to partial-AUC and precision@k. Moreover, the proposed methods outperform all the baselines in various applications. Taken together, our results motivate the use of pAp@k for large-scale recommender systems with heterogeneous user-engagement.
@inproceedings{HiranandaniVKJ2020,
author = {Gaurush Hiranandani and Warut Vijitbenjaronk and Sanmi Koyejo and Prateek Jain},
title = {Optimization and Analysis of the pAp@k Metric for Recommender Systems},
booktitle = {Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event},
publisher = {PMLR},
year = {2020},
volume = {119},
pages = {4260--4270},
url = {http://proceedings.mlr.press/v119/hiranandani20a.html}
}
Soft Threshold Weight Reparameterization for Learnable Sparsity
Aditya Kusupati, Vivek Ramanujan, Raghav Somani, Mitchell Wortsman, Prateek Jain, Sham M. Kakade and Ali Farhadi,
in Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event,
2020.
[BibTeX]
[Abstract]
[URL]
Sparsity in Deep Neural Networks (DNNs) is studied extensively with the focus of maximizing prediction accuracy given an overall parameter budget. Existing methods rely on uniform or heuristic non-uniform sparsity budgets which have sub-optimal layer-wise parameter allocation resulting in a) lower prediction accuracy or b) higher inference cost (FLOPs). This work proposes Soft Threshold Reparameterization (STR), a novel use of the soft-threshold operator on DNN weights. STR smoothly induces sparsity while learning pruning thresholds thereby obtaining a non-uniform sparsity budget. Our method achieves state-of-the-art accuracy for unstructured sparsity in CNNs (ResNet50 and MobileNetV1 on ImageNet-1K), and, additionally, learns non-uniform budgets that empirically reduce the FLOPs by up to 50%. Notably, STR boosts the accuracy over existing results by up to 10% in the ultra sparse (99%) regime and can also be used to induce low-rank (structured sparsity) in RNNs. In short, STR is a simple mechanism which learns effective sparsity budgets that contrast with popular heuristics. Code, pretrained models and sparsity budgets are at https://github.com/RAIVNLab/STR.
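The reparameterization itself is compact: each layer keeps dense weights W and a learnable scalar s, and the effective weights are W soft-thresholded at a threshold derived from s (a sigmoid mapping is one common choice). A minimal NumPy sketch of the forward mapping, assuming a sigmoid threshold function; during training both W and s would receive gradients, which is not shown here.

    import numpy as np

    def str_forward(W, s):
        # Soft-threshold reparameterization: map the learnable scalar s to a
        # nonnegative threshold and soft-threshold the dense weights W with it.
        alpha = 1.0 / (1.0 + np.exp(-s))      # threshold g(s) = sigmoid(s)
        return np.sign(W) * np.maximum(np.abs(W) - alpha, 0.0)

    W = 0.1 * np.random.randn(256, 256)
    W_sparse = str_forward(W, s=-2.0)
    print("layer sparsity:", (W_sparse == 0).mean())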
@inproceedings{KusupatiRSWJKF2020,
author = {Aditya Kusupati and Vivek Ramanujan and Raghav Somani and Mitchell Wortsman and Prateek Jain and Sham M. Kakade and Ali Farhadi},
title = {Soft Threshold Weight Reparameterization for Learnable Sparsity},
booktitle = {Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event},
publisher = {PMLR},
year = {2020},
volume = {119},
pages = {5544--5555},
url = {http://proceedings.mlr.press/v119/kusupati20a.html}
}
ShaRNN: A Method for Accurate Time-series Classification on Tiny Devices
Don Dennis, Alp Acar, Mandikal Vikram, Harsha Simhadri, Venkatesh Saligrama and Prateek Jain,
in Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS),
2019.
[BibTeX]
[Abstract]
[URL]
Recurrent Neural Networks (RNNs) capture long dependencies and context, and hence are the key component of typical sequential data based tasks. However, the sequential nature of RNNs dictates a large inference cost for long sequences even if the hardware supports parallelization. To induce long-term dependencies, and yet admit parallelization, we introduce novel shallow RNNs. In this architecture, the first layer splits the input sequence and runs several independent RNNs. The second layer consumes the output of the first layer using a second RNN, thus capturing long dependencies. We provide theoretical justification for our architecture under weak assumptions that we verify on real-world benchmarks. Furthermore, we show that for time-series classification, our technique leads to substantially improved inference time over standard RNNs without compromising accuracy. For example, we can deploy audio-keyword classification on tiny Cortex M4 devices (100MHz processor, 256KB RAM, no DSP available) which was not possible using standard RNN models. Similarly, using ShaRNN in the popular Listen-Attend-Spell (LAS) architecture for phoneme classification, we can reduce the lag in phoneme classification by 10-12x while maintaining state-of-the-art accuracy.
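A minimal NumPy sketch of the two-layer shallow RNN structure described above: the first layer runs an identical RNN independently on each brick of the input sequence (these calls can run in parallel), and the second layer runs an RNN over the brick summaries. Weight shapes and the brick length below are illustrative choices, not the paper's configuration.

    import numpy as np

    def rnn_final_state(xs, Wx, Wh):
        # Plain tanh RNN; returns the hidden state after consuming the sequence.
        h = np.zeros(Wh.shape[0])
        for x in xs:
            h = np.tanh(Wx @ x + Wh @ h)
        return h

    def shallow_rnn(seq, brick, Wx1, Wh1, Wx2, Wh2):
        # Layer 1: independent RNNs over consecutive bricks of the sequence.
        summaries = [rnn_final_state(seq[i:i + brick], Wx1, Wh1)
                     for i in range(0, len(seq), brick)]
        # Layer 2: a second RNN over the brick summaries captures long-range context.
        return rnn_final_state(summaries, Wx2, Wh2)

    rng = np.random.default_rng(0)
    T, d, h = 128, 16, 32
    seq = rng.standard_normal((T, d))
    feature = shallow_rnn(
        seq, brick=16,
        Wx1=0.1 * rng.standard_normal((h, d)), Wh1=0.1 * rng.standard_normal((h, h)),
        Wx2=0.1 * rng.standard_normal((h, h)), Wh2=0.1 * rng.standard_normal((h, h)))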
@inproceedings{DennisAVSSJ19,
author = {Don Dennis and Alp Acar and Mandikal Vikram and Harsha Simhadri and Venkatesh Saligrama and Prateek Jain},
title = {ShaRNN: A Method for Accurate Time-series Classification on Tiny Devices},
booktitle = {Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2019},
url = {all_papers/DennisAVSSJ19.pdf}
}
Efficient Algorithms for Smooth Minimax Optimization
Kiran Koshy Thekumparampil, Prateek Jain, Praneeth Netrapalli and Sewoong Oh,
in Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS),
2019.
[BibTeX]
[Abstract]
[URL]
This paper studies first order methods for solving smooth minimax optimization problems min_x max_y g(x, y), where g is smooth and g(x, ⋅) is concave for each x. In terms of g(⋅, y), we consider two settings -- strongly convex and nonconvex -- and improve upon the best known rates in both. For the strongly-convex setting, we propose a new algorithm combining Mirror-Prox and Nesterov's AGD, and show that it can find the global optimum in Õ(1/k²) iterations, improving over the current state-of-the-art rate of O(1/k). We use this result along with an inexact proximal point method to obtain an Õ(1/k^{1/3}) rate for finding stationary points in the nonconvex setting, where g(⋅, y) can be nonconvex. This improves over the current best-known rate of O(1/k^{1/5}). Finally, we instantiate our result for finite nonconvex minimax problems, i.e., min_x max_{1≤i≤m} f_i(x) with nonconvex f_i, to obtain a convergence rate in terms of the total number of gradient evaluations needed to find a stationary point.
@inproceedings{TJNO19,
author = {Kiran Koshy Thekumparampil and Prateek Jain and Praneeth Netrapalli and Sewoong Oh},
title = {Efficient Algorithms for Smooth Minimax Optimization},
booktitle = {Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2019},
url = {all_papers/TJNO19.pdf}
}
Nonlinear Inductive Matrix Completion based on One-layer Neural Networks
Kai Zhong, Zhao Song, Prateek Jain and Inderjit S. Dhillon,
in Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS),
2019.
[BibTeX]
[Abstract]
[URL]
The goal of a recommendation system is to predict the interest of a user in a given item by exploiting the existing set of ratings as well as certain user/item features. A standard approach to modeling this problem is Inductive Matrix Completion where the predicted rating is modeled as an inner product of the user and the item features projected onto a latent space. In order to learn the parameters effectively from a small number of observed ratings, the latent space is constrained to be low-dimensional which implies that the parameter matrix is constrained to be low-rank. However, such bilinear modeling of the ratings can be limiting in practice and non-linear prediction functions can lead to significant improvements. A natural approach to introducing non-linearity in the prediction function is to apply a non-linear activation function on top of the projected user/item features. Imposition of non-linearities further complicates an already challenging problem that has two sources of non-convexity: a) low-rank structure of the parameter matrix, and b) non-linear activation function. We show that one can still solve the non-linear Inductive Matrix Completion problem using gradient descent type methods as long as the solution is initialized well. That is, close to the optima, the optimization function is strongly convex and hence admits standard optimization techniques, at least for certain activation functions, such as Sigmoid and tanh. We also highlight the importance of the activation function and show how ReLU can behave significantly differently than say a sigmoid function. Finally, we apply our proposed technique to recommendation systems and semi-supervised clustering, and show that our method can lead to much better performance than standard linear Inductive Matrix Completion methods.
@inproceedings{ZhongSJD19,
author = {Kai Zhong and Zhao Song and Prateek Jain and Inderjit S. Dhillon},
title = {Nonlinear Inductive Matrix Completion based on One-layer Neural Networks},
booktitle = {Proceedings of the Thirty-second Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2019},
url = {all_papers/ZhongSJD19.pdf}
}
GesturePod: Enabling On-device Gesture-based Interaction for White Cane Users
Shishir G. Patil, Don Kurian Dennis, Chirag Pabbaraju, Nadeem Shaheer, Harsha Vardhan Simhadri, Vivek Seshadri, Manik Varma and Prateek Jain,
in Proceedings of the 32nd Annual ACM Symposium on User Interface Software and Technology (UIST),
2019.
[BibTeX]
[Abstract]
[URL]
People using white canes for navigation find it challenging to concurrently access devices such as smartphones. Building on prior research on abandonment of specialized devices, we explore a new touch free mode of interaction wherein a person with visual impairment can perform gestures on their existing white cane to trigger tasks on their smartphone. We present GesturePod, an easy-to-integrate device that clips on to any white cane, and detects gestures performed with the cane. With GesturePod, a user can perform common tasks on their smartphone without touch or even removing the phone from their pocket or bag. We discuss the challenges in building the device and our design choices. We propose a novel, efficient machine learning pipeline to train and deploy the gesture recognition model. Our in-lab study shows that GesturePod achieves 92 percent gesture recognition accuracy and can help perform common smartphone tasks faster. Our in-wild study suggests that GesturePod is a promising tool to improve smartphone access for people with VI, especially in constrained outdoor scenarios.
@inproceedings{PatilDPSSSVJ19,
author = {Shishir G. Patil and Don Kurian Dennis and Chirag Pabbaraju and Nadeem Shaheer and Harsha Vardhan Simhadri and Vivek Seshadri and Manik Varma and Prateek Jain},
title = {GesturePod: Enabling On-device Gesture-based Interaction for White Cane Users},
booktitle = {Proceedings of the 32nd Annual ACM Symposium on User Interface Software and Technology (UIST)},
year = {2019},
pages = {403--415},
note = {slides/PatilDPSSSVJ19.pdf},
url = {all_papers/PatilDPSSSVJ19.pdf},
doi = {10.1145/3332165.3347881}
}
Making the Last Iterate of SGD Information Theoretically Optimal
Prateek Jain, Dheeraj Nagaraj and Praneeth Netrapalli,
in Proceedings of the Annual Conference On Learning Theory (COLT),
2019.
[BibTeX]
[Abstract]
[URL]
Stochastic gradient descent (SGD) is one of the most widely used algorithms for large scale optimization problems. While classical theoretical analysis of SGD for convex problems studies (suffix) averages of iterates and obtains information theoretically optimal bounds on suboptimality, the last point of SGD is, by far, the most preferred choice in practice. The best known results for the last point of SGD (Shamir and Zhang, 2013), however, are suboptimal compared to information theoretic lower bounds by a log T factor, where T is the number of iterations. Harvey et al. (2018) shows that, in fact, this additional log T factor is tight for standard step size sequences of 1/√t and 1/t for non-strongly convex and strongly convex settings, respectively. Similarly, even for subgradient descent (GD) when applied to non-smooth, convex functions, the best known step-size sequences still lead to O(log T)-suboptimal convergence rates (on the final iterate). The main contribution of this work is to design new step size sequences that enjoy information theoretically optimal bounds on the suboptimality of the last point of SGD as well as GD. We achieve this by designing a modification scheme that converts one sequence of step sizes to another, so that the last point of SGD/GD with the modified sequence has the same suboptimality guarantees as the average of SGD/GD with the original sequence. We also show that our result holds with high probability. We validate our results through simulations which demonstrate that the new step size sequence indeed improves the final iterate significantly compared to the standard step size sequences.
@inproceedings{JainNN19,
author = {Prateek Jain and Dheeraj Nagaraj and Praneeth Netrapalli},
title = {Making the Last Iterate of SGD Information Theoretically Optimal},
booktitle = {Proceedings of the Annual Conference On Learning Theory (COLT)},
year = {2019},
pages = {1752--1755},
note = {slides/JainNN19.pdf},
url = {all_papers/JainNN19}
}
Adaptive Hard Thresholding for Near-optimal Consistent Robust Regression
Arun Sai Suggala, Kush Bhatia, Pradeep Ravikumar and Prateek Jain,
in Proceedings of the Annual Conference On Learning Theory (COLT),
2019.
[BibTeX]
[Abstract]
[URL]
We study the problem of robust linear regression with response variable corruptions. We consider the oblivious adversary model, where the adversary corrupts a fraction of the responses in complete ignorance of the data. We provide a nearly linear time estimator which consistently estimates the true regression vector, even with 1−o(1) fraction of corruptions. Existing results in this setting either don’t guarantee consistent estimates or can only handle a small fraction of corruptions. We also extend our estimator to robust sparse linear regression and show that similar guarantees hold in this setting. Finally, we apply our estimator to the problem of linear regression with heavy-tailed noise and show that our estimator consistently estimates the regression vector even when the noise has unbounded variance (e.g., Cauchy distribution), for which most existing results don’t even apply. Our estimator is based on a novel variant of outlier removal via hard thresholding in which the threshold is chosen adaptively and crucially relies on randomness to escape bad fixed points of the non-convex hard thresholding operation.
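For intuition, the non-adaptive version of hard-thresholding-based robust regression alternates between fitting least squares on the points currently marked clean and re-marking the largest-residual points as corruptions. The sketch below shows that generic loop in NumPy; the adaptive threshold schedule and the randomization that the paper's consistency guarantee relies on are not included.

    import numpy as np

    def hard_threshold_regression(X, y, num_corrupt, iters=20):
        # Alternate: (1) least squares on points currently marked clean;
        # (2) mark the `num_corrupt` largest-residual points as corrupted.
        n = len(y)
        clean = np.ones(n, dtype=bool)
        w = np.zeros(X.shape[1])
        for _ in range(iters):
            w, *_ = np.linalg.lstsq(X[clean], y[clean], rcond=None)
            resid = np.abs(X @ w - y)
            order = np.argsort(resid)
            clean = np.zeros(n, dtype=bool)
            clean[order[:n - num_corrupt]] = True
        return w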
@inproceedings{SuggalaBRJ19,
author = {Arun Sai Suggala and Kush Bhatia and Pradeep Ravikumar and Prateek Jain},
title = {Adaptive Hard Thresholding for Near-optimal Consistent Robust Regression},
booktitle = {Proceedings of the Annual Conference On Learning Theory (COLT)},
year = {2019},
pages = {2892--2897},
note = {slides/SuggalaBRJ19.pdf},
url = {all_papers/SuggalaBRJ19.pdf}
}
SGD without Replacement: Sharper Rates for General Smooth Convex Functions
Dheeraj Nagaraj, Prateek Jain and Praneeth Netrapalli,
in Proceedings of the 36th International Conference on Machine Learning (ICML),
2019.
[BibTeX]
[Abstract]
[URL]
We study stochastic gradient descent without replacement (SGDo) for smooth convex functions. SGDo is widely observed to converge faster than true SGD, where each sample is drawn independently with replacement (Bottou, 2009), and hence is more popular in practice. But its convergence properties are not well understood, as sampling without replacement leads to coupling between iterates and gradients. By using the method of exchangeable pairs to bound Wasserstein distance, we provide the first non-asymptotic results for SGDo when applied to general smooth, strongly-convex functions. In particular, we show that SGDo converges at a rate of O(1/K²) while SGD is known to converge at an O(1/K) rate, where K denotes the number of passes over data and is required to be large enough. Existing results for SGDo in this setting require an additional Hessian Lipschitz assumption (Gurbuzbalaban et al., 2015; HaoChen and Sra, 2018). For small K, we show SGDo can achieve the same convergence rate as SGD for general smooth strongly-convex functions. Existing results in this setting require K=1 and hold only for generalized linear models (Shamir, 2016). In addition, by careful analysis of the coupling, for both large and small K, we obtain better dependence on problem dependent parameters like the condition number.
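The two sampling schemes compared above differ only in how indices are drawn. A short NumPy sketch of SGD without replacement for least squares (a fresh random permutation per pass), which is the scheme analyzed as SGDo:

    import numpy as np

    def sgd_without_replacement(X, y, epochs=5, lr=0.01, seed=0):
        # Each pass (epoch) visits every sample exactly once in a fresh random order,
        # i.e. samples are drawn without replacement within the pass.
        rng = np.random.default_rng(seed)
        w = np.zeros(X.shape[1])
        for _ in range(epochs):
            for i in rng.permutation(len(y)):
                w -= lr * (X[i] @ w - y[i]) * X[i]
        return w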
@inproceedings{NagarajJN19,
author = {Dheeraj Nagaraj and Prateek Jain and Praneeth Netrapalli},
title = {SGD without Replacement: Sharper Rates for General Smooth Convex Functions},
booktitle = {Proceedings of the 36th International Conference on Machine Learning (ICML)},
year = {2019},
pages = {4703--4711},
note = {slides/NagarajJN19.pdf},
url = {all_papers/NagarajJN19.pdf}
}
Globally-convergent Iteratively Reweighted Least Squares for Robust Regression Problems
Bhaskar Mukhoty, Govind Gopakumar, Prateek Jain and Purushottam Kar,
in The 22nd International Conference on Artificial Intelligence and Statistics ( AISTATS),
2019.
[BibTeX]
[Abstract]
[URL]
We provide the first global model recovery results for the IRLS (iteratively reweighted least squares) heuristic for robust regression problems. IRLS is known to offer excellent performance, despite bad initializations and data corruption, for several parameter estimation problems. Existing analyses of IRLS frequently require careful initialization, thus offering only local convergence guarantees. We remedy this by proposing augmentations to the basic IRLS routine that not only offer guaranteed global recovery, but in practice also outperform state-of-the-art algorithms for robust regression. Our routines are more immune to hyperparameter misspecification in basic regression tasks, as well as applied tasks such as linear-armed bandit problems. Our theoretical analyses rely on a novel extension of the notions of strong convexity and smoothness to weighted strong convexity and smoothness, and establishing that sub-Gaussian designs offer bounded weighted condition numbers. These notions may be useful in analyzing other algorithms as well.
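The basic IRLS heuristic analyzed above (before the paper's augmentations) alternates between reweighting points inversely to their current absolute residuals and re-solving a weighted least-squares problem. A minimal NumPy sketch of that un-augmented loop; the augmentations that yield the global recovery guarantees are not shown.

    import numpy as np

    def irls_robust_regression(X, y, iters=25, eps=1e-6):
        # Un-augmented IRLS: weight each point by 1 / max(|residual|, eps) and
        # re-solve weighted least squares, iterating until the weights stabilize.
        w = np.zeros(X.shape[1])
        for _ in range(iters):
            resid = np.abs(X @ w - y)
            wts = 1.0 / np.maximum(resid, eps)
            Xw = X * wts[:, None]
            w = np.linalg.solve(Xw.T @ X, Xw.T @ y)
        return w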
@inproceedings{MukhotyGJK19,
author = {Bhaskar Mukhoty and Govind Gopakumar and Prateek Jain and Purushottam Kar},
title = {Globally-convergent Iteratively Reweighted Least Squares for Robust Regression Problems},
booktitle = {The 22nd International Conference on Artificial Intelligence and Statistics ( AISTATS)},
year = {2019},
pages = {313--322},
url = {all_papers/MukhotyGJK19}
}
Learning Natural Programs from a Few Examples in Real-Time
Nagarajan Natarajan, Danny Simmons, Naren Datha, Prateek Jain and Sumit Gulwani,
in Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics (AISTATS),
2019.
[BibTeX]
[Abstract]
[URL]
Programming by examples (PBE) is a rapidly growing subfield of AI that aims to synthesize user-intended programs using input-output examples from the task. As users can provide only a few I/O examples, capturing user-intent accurately and ranking user-intended programs over other programs is challenging even in the simplest of the domains. Commercially deployed PBE systems often require years of engineering effort and domain expertise to devise ranking heuristics for real-time synthesis of accurate programs. But such heuristics may not cater to new domains, or even to a different segment of users from the same domain. In this work, we develop a novel, real-time, ML-based program ranking algorithm that enables synthesis of natural, user-intended, personalized programs. We make two key technical contributions: 1) a new technique to embed programs in a vector space making them amenable to ML-formulations, 2) a novel formulation that interleaves program search with ranking, enabling real-time synthesis of accurate user-intended programs. We implement our solution in the state-of-the-art PROSE framework. The proposed approach learns the intended program with just one I/O example in a variety of real-world string/date/number manipulation tasks, and outperforms state-of-the-art neural synthesis methods along multiple metrics.
@inproceedings{NatarajanSDJG19,
author = {Nagarajan Natarajan and Danny Simmons and Naren Datha and Prateek Jain and Sumit Gulwani},
title = {Learning Natural Programs from a Few Examples in Real-Time},
booktitle = {Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics (AISTATS)},
year = {2019},
pages = {1714--1722},
note = {slides/NatarajanSDJG19.pdf},
url = {all_papers/NatarajanSDJG19.pdf}
}
Distributional Semantics Meets Multi-Label Learning
Vivek Gupta, Rahul Wadbude, Nagarajan Natarajan, Harish Karnick, Prateek Jain and Piyush Rai,
in Proceedings of the 33rd AAAI Conference on Artificial Intelligence,
2019.
[BibTeX]
[Abstract]
[URL]
We present a label embedding based approach to large-scale multi-label learning, drawing inspiration from ideas rooted in distributional semantics, specifically the Skip Gram Negative Sampling (SGNS) approach, widely used to learn word embeddings. Besides leading to a highly scalable model for multi-label learning, our approach highlights interesting connections between label embedding methods commonly used for multi-label learning and paragraph embedding methods commonly used for learning representations of text data. The framework easily extends to incorporating auxiliary information such as label-label correlations; this is crucial especially when many training instances are only partially annotated. To facilitate end-to-end learning, we develop a joint learning algorithm that can learn the embeddings as well as a regression model that predicts these embeddings for the new input to be annotated, via efficient gradient based methods. We demonstrate the effectiveness of our approach through an extensive set of experiments on a variety of benchmark datasets, and show that the proposed models perform favorably as compared to state-of-the-art methods for large-scale multi-label learning.
@inproceedings{GuptaWNKJR2019,
author = {Vivek Gupta and Rahul Wadbude and Nagarajan Natarajan and Harish Karnick and Prateek Jain and Piyush Rai},
title = {Distributional Semantics Meets Multi-Label Learning},
booktitle = {Proceedings of the 33rd AAAI Conference on Artificial Intelligence},
year = {2019},
url = {all_papers/GuptaWNKJR19.pdf}
}
Multiple Instance Learning for Efficient Sequential Data Classification on Resource-constrained Devices
Don Dennis, Chirag Pabbaraju, Harsha Vardhan Simhadri and Prateek Jain,
in Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS),
2018.
[BibTeX]
[Abstract]
[URL]
We study the problem of fast and efficient classification of sequential data (such as time-series) on tiny devices, which is critical for various IoT related applications like audio keyword detection or gesture detection. Such tasks are cast as a standard classification task by sliding windows over the data stream to construct data points. Deploying such classification modules on tiny devices is challenging as predictions over sliding windows of data need to be invoked continuously at a high frequency. Each such predictor instance in itself is expensive as it evaluates large models over long windows of data. In this paper, we address this challenge by exploiting the following two observations about classification tasks arising in typical IoT related applications: (a) the "signature" of a particular class (e.g. an audio keyword) typically occupies a small fraction of the overall data, and (b) class signatures tend to be discernible early on in the data. We propose a method, EMI-RNN, that exploits these observations by using a multiple instance learning formulation along with an early prediction technique to learn a model that achieves better accuracy compared to baseline models, while simultaneously reducing computation by a large fraction. For instance, on a gesture detection benchmark, EMI-RNN improves the standard LSTM model’s accuracy by up to 1 percent while requiring 72x less computation. This enables us to deploy such models for continuous real-time prediction on a small device such as Raspberry Pi0 and Arduino variants, a task that the baseline LSTM could not achieve. Finally, we also provide an analysis of our multiple instance learning algorithm in a simple setting and show that the proposed algorithm converges to the global optimum at a linear rate, one of the first such results in this domain. The code for EMI-RNN is available at: https://github.com/Microsoft/EdgeML/tree/master/tf/examples/EMI-RNN
@inproceedings{DennisPSJ18,
author = {Don Dennis and Chirag Pabbaraju and Harsha Vardhan Simhadri and Prateek Jain},
title = {Multiple Instance Learning for Efficient Sequential Data Classification on Resource-constrained Devices},
booktitle = {Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2018},
pages = {10976--10987},
note = {slides/DennisPSJ18.pdf},
url = {all_papers/DennisPSJ18.pdf}
}
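One way to make the multiple-instance formulation in the abstract above concrete: each labelled sliding window is split into overlapping sub-windows that form a bag of instances, and the window's label is assumed to hold for at least one instance. Below is a minimal data-preparation sketch in NumPy (the function name, shapes, and stride are assumptions made here for illustration, not the EMI-RNN code).
    import numpy as np

    def make_instance_bag(window_data, sub_window, stride=1):
        """Split one labelled window into a bag of overlapping sub-instances.

        window_data: (T, d) sensor window carrying a single class label.
        Returns an array of shape (num_instances, sub_window, d); under the
        multiple-instance view, the label is assumed to hold for at least
        one of these sub-instances.
        """
        T = len(window_data)
        starts = range(0, T - sub_window + 1, stride)
        return np.stack([window_data[s:s + sub_window] for s in starts])
The early-prediction component described in the abstract then lets inference over a window stop as soon as the class signature is detected.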
FastGRNN: A Fast, Accurate, Stable and Tiny Kilobyte Sized Gated Recurrent Neural Network
Aditya Kusupati, Manish Singh, Kush Bhatia, Ashish Kumar, Prateek Jain and Manik Varma,
in Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS),
2018.
[BibTeX]
[Abstract]
[URL]
This paper develops the FastRNN and FastGRNN algorithms to address the twin RNN limitations of inaccurate training and inefficient prediction. Previous approaches have improved accuracy at the expense of prediction costs making them infeasible for resource-constrained and real-time applications. Unitary RNNs have increased accuracy somewhat by restricting the range of the state transition matrix's singular values but have also increased the model size as they require a larger number of hidden units to make up for the loss in expressive power. Gated RNNs have obtained state-of-the-art accuracies by adding extra parameters thereby resulting in even larger models. FastRNN addresses these limitations by adding a residual connection that does not constrain the range of the singular values explicitly and has only two extra scalar parameters. FastGRNN then extends the residual connection to a gate by reusing the RNN matrices to match state-of-the-art gated RNN accuracies but with a 2-4x smaller model. Enforcing FastGRNN's matrices to be low-rank, sparse and quantized resulted in accurate models that could be up to 35x smaller than leading gated and unitary RNNs. This allowed FastGRNN to accurately recognize the "Hey Cortana" wakeword with a 1 KB model and to be deployed on severely resource-constrained IoT microcontrollers too tiny to store other RNN models. FastGRNN's code is available online.
@inproceedings{KusupatiSBKJV18,
author = {Aditya Kusupati and Manish Singh and Kush Bhatia and Ashish Kumar and Prateek Jain and Manik Varma},
title = {FastGRNN: A Fast, Accurate, Stable and Tiny Kilobyte Sized Gated Recurrent Neural Network},
booktitle = {Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2018},
pages = {9031--9042},
note = {slides/fastgrnn.pdf},
url = {all_papers/KusupatiSBKJV18.pdf}
}
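Up to implementation details, the residual connection described in the abstract above adds just two scalars, alpha and beta, to a standard RNN update: h_t = alpha * tanh(W x_t + U h_{t-1} + b) + beta * h_{t-1}. Here is a plain NumPy sketch of this FastRNN-style forward pass (shapes and default values are illustrative; the FastGRNN gate and the low-rank/sparse/quantized compression steps are not shown).
    import numpy as np

    def fastrnn_forward(xs, W, U, b, alpha=0.1, beta=0.9):
        """Residual RNN forward pass (illustrative sketch).

        xs: (T, d) input sequence; W: (h, d); U: (h, h); b: (h,).
        alpha and beta are the two extra scalar parameters (trainable
        in the paper; fixed here for illustration).
        """
        h = np.zeros(U.shape[0])
        for x in xs:
            h = alpha * np.tanh(W @ x + U @ h + b) + beta * h
        return h  # final hidden state, typically fed to a small classifier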
Support Recovery for Orthogonal Matching Pursuit: Upper and Lower bounds
Raghav Somani, Chirag Gupta, Prateek Jain and Praneeth Netrapalli,
in Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS),
2018.
[BibTeX]
[Abstract]
[URL]
This paper studies the problem of sparse regression where the goal is to learn a sparse vector that best optimizes a given objective function. Under the assumption that the objective function satisfies restricted strong convexity (RSC), we analyze orthogonal matching pursuit (OMP), a greedy algorithm that is used heavily in applications, and obtain a support recovery result as well as a tight generalization error bound for OMP. Furthermore, we obtain lower bounds for OMP, showing that both our results on support recovery and generalization error are tight up to logarithmic factors. To the best of our knowledge, these support recovery and generalization bounds are the first such matching upper and lower bounds (up to logarithmic factors) for {\em any} sparse regression algorithm under the RSC assumption.
@inproceedings{SomaniGJN18,
author = {Raghav Somani and Chirag Gupta and Prateek Jain and Praneeth Netrapalli},
title = {Support Recovery for Orthogonal Matching Pursuit: Upper and Lower bounds},
booktitle = {Proceedings of the Thirty-first Annual Conference on Neural Information Processing Systems (NeurIPS)},
year = {2018},
pages = {10837--10847},
note = {slides/SomaniGJN18.pdf},
url = {all_papers/SomaniGJN18.pdf}
}
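For context, the greedy routine analyzed above proceeds by repeatedly adding the coordinate most correlated with the current residual and then refitting on the selected support. A minimal least-squares instance of OMP in NumPy follows (the fixed sparsity level k as the stopping rule and the dense lstsq refit are simplifications made here for illustration).
    import numpy as np

    def omp(X, y, k):
        """Orthogonal matching pursuit for least squares (illustrative sketch).

        X: (n, p) design matrix, y: (n,) responses, k: target sparsity.
        """
        n, p = X.shape
        support, r = [], y.copy()
        for _ in range(k):
            j = int(np.argmax(np.abs(X.T @ r)))       # most correlated column
            if j not in support:
                support.append(j)
            w_s, *_ = np.linalg.lstsq(X[:, support], y, rcond=None)
            r = y - X[:, support] @ w_s               # refit, then update residual
        w = np.zeros(p)
        w[support] = w_s
        return w, sorted(support)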
Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form
Srinadh Bhojanapalli, Nicolas Boumal, Prateek Jain and Praneeth Netrapalli,
in Proceedings of the Annual Conference On Learning Theory (COLT),
2018.
[BibTeX]
[Abstract]
[URL]
Semidefinite programs (SDP) are important in learning and combinatorial optimization with numerous applications. In pursuit of low-rank solutions and low complexity algorithms, we consider the Burer--Monteiro factorization approach for solving SDPs. We show that all approximate local optima are global optima for the penalty formulation of appropriately rank-constrained SDPs as long as the number of constraints scales sub-quadratically with the desired rank of the optimal solution. Our result is based on a simple penalty function formulation of the rank-constrained SDP along with a smoothed analysis to avoid worst-case cost matrices. We particularize our results to two applications, namely, Max-Cut and matrix completion.
@inproceedings{BhojanapalliBJN18,
author = {Srinadh Bhojanapalli and Nicolas Boumal and Prateek Jain and Praneeth Netrapalli},
title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form},
booktitle = {Proceedings of the Annual Conference On Learning Theory (COLT)},
year = {2018},
pages = {3243--3270},
note = {slides/BhojanapalliBJN18.pdf},
url = {all_papers/BhojanapalliBJN18.pdf}
}
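For concreteness, the quadratic penalty form referred to in the abstract above can be written as follows; the notation here is illustrative ($C$ is the cost matrix, the linear map $\mathcal{A}$ and vector $b$ encode the SDP constraints, $\mu > 0$ is the penalty weight, and $Y$ has $k$ columns so that $YY^\top$ has rank at most $k$):
    \min_{Y \in \mathbb{R}^{n \times k}} \; \langle C, Y Y^\top \rangle \;+\; \frac{\mu}{2} \bigl\| \mathcal{A}(Y Y^\top) - b \bigr\|_2^2
The smoothed-analysis result above concerns approximate local optima of this non-convex problem in the factor $Y$.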
Accelerating Stochastic Gradient Descent for Least Squares Regression
Prateek Jain, Sham M. Kakade, Rahul Kidambi, Praneeth Netrapalli and Aaron Sidford,
in Proceedings of the Annual Conference On Learning Theory (COLT),
2018.
[BibTeX]
[Abstract]
[URL]
There is widespread sentiment that it is not possible to effectively utilize fast gradient methods (e.g. Nesterov's acceleration, conjugate gradient, heavy ball) for the purposes of stochastic optimization due to their instability and error accumulation, a notion made precise in d'Aspremont 2008 and Devolder, Glineur, and Nesterov 2014. This work considers these issues for the special case of stochastic approximation for the least squares regression problem, and our main result refutes the conventional wisdom by showing that acceleration can be made robust to statistical errors. In particular, this work introduces an accelerated stochastic gradient method that provably achieves the minimax optimal statistical risk faster than stochastic gradient descent. Critical to the analysis is a sharp characterization of accelerated stochastic gradient descent as a stochastic process. We hope this characterization gives insights towards the broader question of designing simple and effective accelerated stochastic methods for more general convex and non-convex optimization problems.
@inproceedings{JKKNS18,
author = {Prateek Jain and Sham M. Kakade and Rahul Kidambi and Praneeth Netrapalli and Aaron Sidford},
title = {Accelerating Stochastic Gradient Descent for Least Squares Regression},
booktitle = {Proceedings of the Annual Conference On Learning Theory (COLT)},
year = {2018},
pages = {545--604},
note = {slides/JKNNS18.pdf},
url = {all_papers/JKKNS18.pdf}
}
Differentially Private Matrix Completion Revisited
Prateek Jain, Om Dipakbhai Thakkar and Abhradeep Thakurta,
in Proceedings of the 35th International Conference on Machine Learning (ICML),
2018.
[BibTeX]
[Abstract]
[URL]
We study the problem of privacy-preserving collaborative filtering where the objective is to reconstruct the entire users-items preference matrix using a few observed preferences of users for some of the items. Furthermore, the collaborative filtering algorithm should reconstruct the preference matrix while preserving the privacy of each user. We study this problem in the setting of joint differential privacy where each user computes her own preferences for all the items, without violating privacy of other users' preferences. We provide the first provably differentially private algorithm with formal utility guarantees for this problem. Our algorithm is based on the Frank-Wolfe (FW) method, and consistently estimates the underlying preference matrix as long as the number of users $m$ is $\omega(n^{5/4})$, where $n$ is the number of items, and each user provides her preference for at least $\sqrt{n}$ randomly selected items. We also empirically evaluate our FW-based algorithm on a suite of datasets, and show that our method provides nearly the same accuracy as the state-of-the-art non-private algorithm, and outperforms the state-of-the-art private algorithm by as much as 30 percent.
@inproceedings{JTT18,
author = {Prateek Jain and Om Dipakbhai Thakkar and Abhradeep Thakurta},
title = {Differentially Private Matrix Completion Revisited},
booktitle = {Proceedings of the 35th International Conference on Machine Learning (ICML)},
year = {2018},
pages = {2220--2229},
note = {slides/JTT18.pdf},
url = {all_papers/JTT18.pdf}
}
Neural-Guided Deductive Search for Real-Time Program Synthesis from Examples
Ashwin Kalyan, Abhishek Mohta, Oleksandr Polozov, Dhruv Batra, Prateek Jain and Sumit Gulwani,
in Proceedings of the International Conference on Learning Representations (ICLR),
2018.
[BibTeX]
[Abstract]
[URL]
Synthesizing user-intended programs from a small number of input-output examples is a challenging problem with several important applications like spreadsheet manipulation, data wrangling and code refactoring. Existing synthesis systems either completely rely on deductive logic techniques that are extensively hand-engineered or on purely statistical models that need massive amounts of data, and in general fail to provide real-time synthesis on challenging benchmarks. In this work, we propose Neural Guided Deductive Search (NGDS), a hybrid synthesis technique that combines the best of both symbolic logic techniques and statistical models. Thus, it produces programs that satisfy the provided specifications by construction and generalize well on unseen examples, similar to data-driven systems. Our technique effectively utilizes the deductive search framework to reduce the learning problem of the neural component to a simple supervised learning setup. Further, this allows us to both train on sparingly available real-world data and still leverage powerful recurrent neural network encoders. We demonstrate the effectiveness of our method by evaluating on real-world customer scenarios by synthesizing accurate programs with up to 12× speed-up compared to state-of-the-art systems.
@inproceedings{kalyanMPBJG18,
author = {Ashwin Kalyan and Abhishek Mohta and Oleksandr Polozov and Dhruv Batra and Prateek Jain and Sumit Gulwani},
title = {Neural-Guided Deductive Search for Real-Time Program Synthesis from Examples},
booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
year = {2018},
note = {slides/kalyanMPBJG18.pdf},
url = {all_papers/kalyanMPBJG18.pdf}
}
On the insufficiency of existing momentum schemes for Stochastic Optimization
Rahul Kidambi, Praneeth Netrapalli, Prateek Jain and Sham M. Kakade,
in Proceedings of the International Conference on Learning Representations (ICLR),
2018.
[BibTeX]
[Abstract]
[URL]
Momentum based stochastic gradient methods such as heavy ball (HB) and Nesterov's accelerated gradient descent (NAG) method are widely used in practice for training deep networks and other supervised learning models, as they often provide significant improvements over stochastic gradient descent (SGD). Rigorously speaking, fast gradient methods have provable improvements over gradient descent only for the deterministic case, where the gradients are exact. In the stochastic case, the popular explanation for their wide applicability is that when these fast gradient methods are applied in the stochastic case, they partially mimic their exact gradient counterparts, resulting in some practical gain. This work provides a counterpoint to this belief by proving that there exist simple problem instances where these methods cannot outperform SGD despite the best setting of their parameters. These negative problem instances are, in an informal sense, generic; they do not look like carefully constructed pathological instances. These results suggest (along with empirical evidence) that HB or NAG's practical performance gains are a by-product of minibatching.
Furthermore, this work provides a viable (and provable) alternative, which, on the same set of problem instances, significantly improves over HB, NAG, and SGD's performance. This algorithm, referred to as Accelerated Stochastic Gradient Descent (ASGD), is a simple-to-implement stochastic algorithm, based on a relatively less popular variant of Nesterov's Acceleration. Extensive empirical results in this paper show that ASGD has performance gains over HB, NAG, and SGD. The code for implementing the ASGD Algorithm can be found at https://github.com/rahulkidambi/AccSGD.
@inproceedings{KidambiNJK18,
author = {Rahul Kidambi and Praneeth Netrapalli and Prateek Jain and Sham M. Kakade},
title = {On the insufficiency of existing momentum schemes for Stochastic Optimization},
booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
year = {2018},
note = {slides/KidambiNJK18.pdf},
url = {all_papers/KidambiNJK18.pdf}
}
FlashProfile: a framework for synthesizing data profiles
Saswat Padhi, Prateek Jain, Daniel Perelman, Oleksandr Polozov, Sumit Gulwani and Todd D. Millstein,
in Proceedings of Object-Oriented Programming, Systems, Languages and Applications (OOPSLA),
2018.
[BibTeX]
[Abstract]
[URL]
We address the problem of learning a syntactic profile for a collection of strings, i.e. a set of regex-like patterns that succinctly describe the syntactic variations in the strings. Real-world datasets, typically curated from multiple sources, often contain data in various syntactic formats. Thus, any data processing task is preceded by the critical step of data format identification. However, manual inspection of data to identify the different formats is infeasible in standard big-data scenarios.
Prior techniques are restricted to a small set of pre-defined patterns (e.g. digits, letters, words, etc.), and provide no control over granularity of profiles. We define syntactic profiling as a problem of clustering strings based on syntactic similarity, followed by identifying patterns that succinctly describe each cluster. We present a technique for synthesizing such profiles over a given language of patterns, that also allows for interactive refinement by requesting a desired number of clusters.
Using a state-of-the-art inductive synthesis framework, PROSE, we have implemented our technique as FlashProfile. Across 153 tasks over 75 large real datasets, we observe a median profiling time of only 0.7 s. Furthermore, we show that access to syntactic profiles may allow for more accurate synthesis of programs, i.e. using fewer examples, in programming-by-example (PBE) workflows such as Flash Fill.
@inproceedings{PadhiJPPGM18,
author = {Saswat Padhi and Prateek Jain and Daniel Perelman and Oleksandr Polozov and Sumit Gulwani and Todd D. Millstein},
title = {FlashProfile: a framework for synthesizing data profiles},
booktitle = {Proceedings of Object-Oriented Programming, Systems, Languages and Applications (OOPSLA)},
year = {2018},
pages = {150:1--150:28},
note = {slides/PadhiJPPGM18.pdf},
url = {all_papers/PadhiJPPGM18.pdf},
doi = {https://doi.org/10.1145/3276520}
}
A Markov Chain Theory Approach to Characterizing the Minimax Optimality of Stochastic Gradient Descent (for Least Squares)
Prateek Jain, Sham M. Kakade, Rahul Kidambi, Praneeth Netrapalli, Venkata Krishna Pillutla and Aaron Sidford,
in Proceedings of the Thirty-seventh IARCS Annual Conference on Foundations of Software Technology and Theoretical Computer Science (FSTTCS),
2017.
[BibTeX]
[Abstract]
[URL]
This work provides a simplified proof of the statistical minimax optimality of (iterate averaged) stochastic gradient descent (SGD), for the special case of least squares. This result is obtained by analyzing SGD as a stochastic process and by sharply characterizing the stationary covariance matrix of this process. The finite rate optimality characterization captures the constant factors and addresses model mis-specification.
@inproceedings{JainKKNPS17,
author = {Prateek Jain and Sham M. Kakade and Rahul Kidambi and Praneeth Netrapalli and Venkata Krishna Pillutla and Aaron Sidford},
title = {A Markov Chain Theory Approach to Characterizing the Minimax Optimality of Stochastic Gradient Descent (for Least Squares)},
booktitle = {Proceedings of the Thirty-seventh IARCS Annual Conference on Foundations of Software Technology and Theoretical Computer Science (FSTTCS)},
year = {2017},
pages = {2:1--2:10},
url = {all_papers/JKKNPS17.pdf}
}
Consistent Robust Regression
Kush Bhatia, Prateek Jain, Parameswaran Kamalaruban and Purushottam Kar,
in Proceedings of the 30th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2017.
[BibTeX]
[Abstract]
[URL]
We present the first efficient and provably consistent estimator for the robust regression problem. The area of robust learning and optimization has generated a significant amount of interest in the learning and statistics communities in recent years owing to its applicability in scenarios with corrupted data, as well as in handling model mis-specifications. In particular, special interest has been devoted to the fundamental problem of robust linear regression where estimators that can tolerate corruption in up to a constant fraction of the response variables are widely studied. Surprisingly however, to this date, we are not aware of a polynomial time estimator that offers a consistent estimate in the presence of dense, unbounded corruptions. In this work we present such an estimator, called CRR. This solves an open problem put forward in the work of (Bhatia et al, 2015). Our consistency analysis requires a novel two-stage proof technique involving a careful analysis of the stability of ordered lists which may be of independent interest. We show that CRR not only offers consistent estimates, but is empirically far superior to several other recently proposed algorithms for the robust regression problem, including extended Lasso and the TORRENT algorithm. In comparison, CRR offers comparable or better model recovery but with runtimes that are faster by an order of magnitude.
@inproceedings{BhatiaJKK17,
author = {Kush Bhatia and Prateek Jain and Parameswaran Kamalaruban and Purushottam Kar},
title = {Consistent Robust Regression},
booktitle = {Proceedings of the 30th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2017},
pages = {2107--2116},
url = {all_papers/BhatiaJKK17.pdf}
}
Learning Mixture of Gaussians with Streaming Data
Aditi Raghunathan, Prateek Jain and Ravishankar Krishnaswamy,
in Proceedings of the 30th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2017.
[BibTeX]
[URL]
@inproceedings{RaghunathanJK17,
author = {Aditi Raghunathan and Prateek Jain and Ravishankar Krishnaswamy},
title = {Learning Mixture of Gaussians with Streaming Data},
booktitle = {Proceedings of the 30th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2017},
pages = {6608--6617},
url = {all_papers/RaghunathanJK17.pdf}
}
Programming by Examples: PL meets ML
Sumit Gulwani and Prateek Jain,
in Proceedings of the 15th Asian Symposium on Programming Languages and Systems (APLAS),
2017.
[BibTeX]
[Abstract]
[URL]
Programming by Examples (PBE) involves synthesizing intended programs in an underlying domain-specific language from example-based specifications. PBE systems are already revolutionizing the application domain of data wrangling and are set to significantly impact several other domains including code refactoring. There are three key components in a PBE system. (i) A search algorithm that can efficiently search for programs that are consistent with the examples provided by the user. We leverage a divide-and-conquer-based deductive search paradigm that inductively reduces the problem of synthesizing a program expression of a certain kind that satisfies a given specification into sub-problems that refer to sub-expressions or sub-specifications. (ii) Program ranking techniques to pick an intended program from among the many that satisfy the examples provided by the user. We leverage features of the program structure as well as of the outputs generated by the program on test inputs. (iii) User interaction models to facilitate usability and debuggability. We leverage active-learning techniques based on clustering inputs and synthesizing multiple programs. Each of these PBE components leverages both symbolic reasoning and heuristics. We make the case for synthesizing these heuristics from training data using appropriate machine learning methods. This can not only lead to better heuristics, but can also enable easier development, maintenance, and even personalization of a PBE system.
@inproceedings{GulwaniJ17,
author = {Sumit Gulwani and Prateek Jain},
title = {Programming by Examples: PL meets ML},
booktitle = {Proceedings of the 15th Asian Symposium on Programming Languages and Systems (APLAS)},
year = {2017},
url = {all_papers/GulwaniJ17_APLAS.pdf}
}
Active Heteroscedastic Regression
Kamalika Chaudhuri, Prateek Jain and Nagarajan Natarajan,
in Proceedings of the 34th International Conference on Machine Learning (ICML),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{ChaudhuriJN17,
author = {Kamalika Chaudhuri and Prateek Jain and Nagarajan Natarajan},
title = {Active Heteroscedastic Regression},
booktitle = {Proceedings of the 34th International Conference on Machine Learning (ICML)},
year = {2017},
pages = {694--702},
url = {http://proceedings.mlr.press/v70/chaudhuri17a.html}
}
Nearly Optimal Robust Matrix Completion
Yeshwanth Cherapanamjeri, Kartik Gupta and Prateek Jain,
in Proceedings of the 34th International Conference on Machine Learning (ICML),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{CherapanamjeriGJ17,
author = {Yeshwanth Cherapanamjeri and Kartik Gupta and Prateek Jain},
title = {Nearly Optimal Robust Matrix Completion},
booktitle = {Proceedings of the 34th International Conference on Machine Learning (ICML)},
year = {2017},
pages = {797--805},
url = {http://proceedings.mlr.press/v70/cherapanamjeri17a.html}
}
ProtoNN: Compressed and Accurate kNN for Resource-scarce Devices
Chirag Gupta, Arun Sai Suggala, Ankit Goyal, Harsha Vardhan Simhadri, Bhargavi Paranjape, Ashish Kumar, Saurabh Goyal, Raghavendra Udupa, Manik Varma and Prateek Jain,
in Proceedings of the 34th International Conference on Machine Learning (ICML),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{GuptaSGSPKGUVJ17,
author = {Chirag Gupta and Arun Sai Suggala and Ankit Goyal and Harsha Vardhan Simhadri and Bhargavi Paranjape and Ashish Kumar and Saurabh Goyal and Raghavendra Udupa and Manik Varma and Prateek Jain},
title = {ProtoNN: Compressed and Accurate kNN for Resource-scarce Devices},
booktitle = {Proceedings of the 34th International Conference on Machine Learning (ICML)},
year = {2017},
pages = {1331--1340},
url = {all_papers/GuptaSGSPKGUVJ17_ICML.pdf}
}
Recovery Guarantees for One-hidden-layer Neural Networks
Kai Zhong, Zhao Song, Prateek Jain, Peter L. Bartlett and Inderjit S. Dhillon,
in Proceedings of the 34th International Conference on Machine Learning (ICML),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{ZhongSJBD17,
author = {Kai Zhong and Zhao Song and Prateek Jain and Peter L. Bartlett and Inderjit S. Dhillon},
title = {Recovery Guarantees for One-hidden-layer Neural Networks},
booktitle = {Proceedings of the 34th International Conference on Machine Learning (ICML)},
year = {2017},
pages = {4140--4149},
note = {slides/ZhongSJBD17.pdf},
url = {all_papers/ZSJBD17_ICML.pdf}
}
Thresholding Based Outlier Robust PCA
Yeshwanth Cherapanamjeri, Prateek Jain and Praneeth Netrapalli,
in Proceedings of the 30th Conference on Learning Theory (COLT),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{CherapanamjeriJN17,
author = {Yeshwanth Cherapanamjeri and Prateek Jain and Praneeth Netrapalli},
title = {Thresholding Based Outlier Robust PCA},
booktitle = {Proceedings of the 30th Conference on Learning Theory (COLT)},
year = {2017},
pages = {593--628},
url = {http://proceedings.mlr.press/v65/cherapanamjeri17a.html}
}
Fast second-order cone programming for safe mission planning
Kai Zhong, Prateek Jain and Ashish Kapoor,
in 2017 IEEE International Conference on Robotics and Automation (ICRA),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{ZhongJK17,
author = {Kai Zhong and Prateek Jain and Ashish Kapoor},
title = {Fast second-order cone programming for safe mission planning},
booktitle = {2017 IEEE International Conference on Robotics and Automation (ICRA)},
year = {2017},
pages = {79--86},
url = {https://doi.org/10.1109/ICRA.2017.7989014},
doi = {https://doi.org/10.1109/ICRA.2017.7989014}
}
Global Convergence of Non-Convex Gradient Descent for Computing Matrix Squareroot
Prateek Jain, Chi Jin, Sham M. Kakade and Praneeth Netrapalli,
in Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JJKN17,
author = {Prateek Jain and Chi Jin and Sham M. Kakade and Praneeth Netrapalli},
title = {Global Convergence of Non-Convex Gradient Descent for Computing Matrix Squareroot},
booktitle = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS)},
year = {2017},
pages = {479--488},
url = {http://proceedings.mlr.press/v54/jain17a.html}
}
Scalable Optimization of Multivariate Performance Measures in Multi-instance Multi-label Learning
Apoorv Aggarwal, Sandip Ghoshal, Ankith M. S. Shetty, Suhit Sinha, Ganesh Ramakrishnan, Purushottam Kar and Prateek Jain,
in Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence (AAAI),
2017.
[BibTeX]
[Abstract]
[URL]
@inproceedings{AggarwalGSSRKJ17,
author = {Apoorv Aggarwal and Sandip Ghoshal and Ankith M. S. Shetty and Suhit Sinha and Ganesh Ramakrishnan and Purushottam Kar and Prateek Jain},
title = {Scalable Optimization of Multivariate Performance Measures in Multi-instance Multi-label Learning},
booktitle = {Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence (AAAI)},
year = {2017},
pages = {1698--1704},
url = {all_papers/AggarwalGSSRKJ17.pdf}
}
Structured Sparse Regression via Greedy Hard Thresholding
Prateek Jain, Nikhil Rao and Inderjit S. Dhillon,
in Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JRD16,
author = {Prateek Jain and Nikhil Rao and Inderjit S. Dhillon},
title = {Structured Sparse Regression via Greedy Hard Thresholding},
booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2016},
pages = {1516--1524},
url = {http://papers.nips.cc/paper/6425-structured-sparse-regression-via-greedy-hard-thresholding}
}
Regret Bounds for Non-decomposable Metrics with Missing Labels
Nagarajan Natarajan and Prateek Jain,
in Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{NatarajanJ16,
author = {Nagarajan Natarajan and Prateek Jain},
title = {Regret Bounds for Non-decomposable Metrics with Missing Labels},
booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2016},
pages = {2874--2882},
url = {http://papers.nips.cc/paper/6178-regret-bounds-for-non-decomposable-metrics-with-missing-labels}
}
Selective inference for group-sparse linear models
Fan Yang, Rina Foygel Barber, Prateek Jain and John D. Lafferty,
in Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{YangBJL16,
author = {Fan Yang and Rina Foygel Barber and Prateek Jain and John D. Lafferty},
title = {Selective inference for group-sparse linear models},
booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2016},
pages = {2469--2477},
url = {http://papers.nips.cc/paper/6437-selective-inference-for-group-sparse-linear-models}
}
Mixed Linear Regression with Multiple Components
Kai Zhong, Prateek Jain and Inderjit S. Dhillon,
in Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{ZhongJD16,
author = {Kai Zhong and Prateek Jain and Inderjit S. Dhillon},
title = {Mixed Linear Regression with Multiple Components},
booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2016},
pages = {2190--2198},
url = {http://papers.nips.cc/paper/6240-mixed-linear-regression-with-multiple-components}
}
Diverse Yet Efficient Retrieval using Locality Sensitive Hashing
Vidyadhar Rao, Prateek Jain and C. V. Jawahar,
in Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval (ICMR),
2016.
[BibTeX]
[URL]
@inproceedings{RaoJJ16,
author = {Vidyadhar Rao and Prateek Jain and C. V. Jawahar},
title = {Diverse Yet Efficient Retrieval using Locality Sensitive Hashing},
booktitle = {Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval (ICMR)},
year = {2016},
pages = {189--196},
url = {http://doi.acm.org/10.1145/2911996.2911998},
doi = {https://doi.org/10.1145/2911996.2911998}
}
Streaming PCA: Matching Matrix Bernstein and Near-Optimal Finite Sample Guarantees for Oja's Algorithm
Prateek Jain, Chi Jin, Sham M. Kakade, Praneeth Netrapalli and Aaron Sidford,
in Proceedings of the 29th Conference on Learning Theory (COLT),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainJKNS16,
author = {Prateek Jain and Chi Jin and Sham M. Kakade and Praneeth Netrapalli and Aaron Sidford},
title = {Streaming PCA: Matching Matrix Bernstein and Near-Optimal Finite Sample Guarantees for Oja's Algorithm},
booktitle = {Proceedings of the 29th Conference on Learning Theory (COLT)},
year = {2016},
pages = {1147--1164},
url = {http://jmlr.org/proceedings/papers/v49/jain16.html}
}
Tensor vs. Matrix Methods: Robust Tensor Decomposition under Block Sparse Perturbations
Anima Anandkumar, Prateek Jain, Yang Shi and U. N. Niranjan,
in Proceedings of the 19th International Conference on Artificial Intelligence and Statistics (AISTATS),
2016.
[BibTeX]
[Abstract]
[URL]
@inproceedings{AnandkumarJSN16,
author = {Anima Anandkumar and Prateek Jain and Yang Shi and U. N. Niranjan},
title = {Tensor vs. Matrix Methods: Robust Tensor Decomposition under Block Sparse Perturbations},
booktitle = {Proceedings of the 19th International Conference on Artificial Intelligence and Statistics (AISTATS)},
year = {2016},
pages = {268--276},
url = {http://jmlr.org/proceedings/papers/v51/anandkumar16.html}
}
Efficient Matrix Sensing Using Rank-1 Gaussian Measurements
Kai Zhong, Prateek Jain and Inderjit S. Dhillon,
in Algorithmic Learning Theory - 26th International Conference (ALT),
2015.
[BibTeX]
[Abstract]
[URL]
@inproceedings{ZhongJD15,
author = {Kai Zhong and Prateek Jain and Inderjit S. Dhillon},
title = {Efficient Matrix Sensing Using Rank-1 Gaussian Measurements},
booktitle = {Algorithmic Learning Theory - 26th International Conference (ALT)},
year = {2015},
pages = {3--18},
url = {https://doi.org/10.1007/978-3-319-24486-0_1},
doi = {https://doi.org/10.1007/978-3-319-24486-0_1}
}
Robust Regression via Hard Thresholding
Kush Bhatia, Prateek Jain and Purushottam Kar,
in Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2015.
[BibTeX]
[Abstract]
[URL]
We study the problem of Robust Least Squares Regression (RLSR) where several response variables can be adversarially corrupted. More specifically, for a data matrix $X\in \mathbb{R}^{p\times n}$ and an underlying model $w^*$, the response vector is generated as $y=X^T w^*+b$ where $b\in \mathbb{R}^n$ is the corruption vector supported over at most $C\cdot n$ coordinates. Existing exact recovery results for RLSR focus solely on $L_1$-penalty based convex formulations and impose relatively strict model assumptions such as requiring the corruptions $b$ to be selected independently of $X$.
In this work, we study a simple hard-thresholding algorithm called TORRENT which, under mild conditions on $X$, can recover $w^*$ exactly even if $b$ corrupts the response variables in an \emph{adversarial} manner, i.e. both the support and entries of $b$ are selected adversarially after observing $X$ and $w^*$. Our results hold under \emph{deterministic} assumptions which are satisfied if $X$ is sampled from any sub-Gaussian distribution. Finally unlike existing results that apply only to a fixed $w^*$, generated independently of $X$, our results are \emph{universal} and hold for any $w^*\in \mathbb{R}^p$.
Next, we propose gradient descent-based extensions of TORRENT that can scale efficiently to large scale problems, such as high dimensional sparse recovery, and prove similar recovery guarantees for these extensions. Empirically we find TORRENT, and more so its extensions, offering significantly faster recovery than the state-of-the-art $L_1$ solvers. For instance, even on moderate-sized datasets (with $p=50K$) with around $40\%$ corrupted responses, a variant of our proposed method called TORRENT-HYB is more than $20\times$ faster than the best $L_1$ solver.
@inproceedings{BhatiaJK15,
author = {Kush Bhatia and Prateek Jain and Purushottam Kar},
title = {Robust Regression via Hard Thresholding},
booktitle = {Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2015},
url = {all_papers/BJK15_NIPS.pdf}
}
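A minimal sketch of the fully-corrective hard-thresholding loop the abstract above describes: alternately fit least squares on the points currently believed to be clean, then re-select the points with the smallest residuals. The assumed corruption fraction beta and the iteration count are illustrative choices, and the scalable gradient-based extensions are not shown.
    import numpy as np

    def hard_thresholding_regression(X, y, beta=0.4, iters=30):
        """Hard-thresholding robust regression (fully-corrective sketch).

        X: (n, p), y: (n,). beta: assumed upper bound on the corrupted
        fraction; the (1 - beta) * n points with smallest residuals are
        kept as the active set on each round.
        """
        n, p = X.shape
        keep = n - int(beta * n)
        active = np.arange(n)                          # start with all points
        w = np.zeros(p)
        for _ in range(iters):
            w, *_ = np.linalg.lstsq(X[active], y[active], rcond=None)
            r = np.abs(y - X @ w)
            active = np.argsort(r)[:keep]              # keep smallest residuals
        return w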
Sparse Local Embeddings for Extreme Multi-label Classification
Kush Bhatia, Himanshu Jain, Purushottam Kar, Manik Varma and Prateek Jain,
in Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2015.
[BibTeX]
[Abstract]
[URL]
The objective in extreme multi-label learning is to train a classifier that can automatically tag a novel data point with the most relevant subset of labels from an extremely large label set. Embedding based approaches attempt to make training and prediction tractable by assuming that the training label matrix is low-rank and reducing the effective number of labels by projecting the high dimensional label vectors onto a low dimensional linear subspace. Still, leading embedding approaches have been unable to deliver high prediction accuracies, or scale to large problems as the low rank assumption is violated in most real world applications. In this paper we develop the SLEEC classifier to address both limitations. The main technical contribution in SLEEC is a formulation for learning a small ensemble of local distance preserving embeddings which can accurately predict infrequently occurring (tail) labels. This allows SLEEC to break free of the traditional low-rank assumption and boost classification accuracy by learning embeddings which preserve pairwise distances between only the nearest label vectors.
We conducted extensive experiments on several real-world, as well as benchmark data sets and compared our method against state-of-the-art methods for extreme multi-label classification. Experiments reveal that SLEEC can make significantly more accurate predictions than the state-of-the-art methods including both embedding-based (by as much as 35\%) as well as tree-based (by as much as 6\%) methods. SLEEC can also scale efficiently to data sets with a million labels which are beyond the pale of leading embedding methods.
@inproceedings{BhatiaJKVJ15,
author = {Kush Bhatia and Himanshu Jain and Purushottam Kar and Manik Varma and Prateek Jain},
title = {Sparse Local Embeddings for Extreme Multi-label Classification},
booktitle = {Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2015},
url = {all_papers/BJKVJ15_NIPS.pdf}
}
Predtron: A Family of Online Algorithms for General Prediction Problems
Prateek Jain, Nagarajan Natarajan and Ambuj Tewari,
in Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2015.
[BibTeX]
[Abstract]
[URL]
Modern prediction problems arising in multilabel learning and learning to rank pose unique challenges to the classical theory of supervised learning. These problems have large prediction and label spaces of a combinatorial nature and involve sophisticated loss functions. We offer a general framework to derive mistake driven online algorithms and associated loss bounds. The key ingredients in our framework are a general loss function, a general vector space representation of predictions, and a notion of margin with respect to a general norm. Our general algorithm, Predtron, yields the perceptron algorithm and its variants when instantiated on classic problems such as binary classification, multiclass classification, ordinal regression, and multilabel classification. For multilabel ranking and subset ranking, we derive novel algorithms, notions of margins, and loss bounds. A simulation study confirms the behavior predicted by our bounds and demonstrates the flexibility of the design choices in our framework.
@inproceedings{JNT15,
author = {Prateek Jain and Nagarajan Natarajan and Ambuj Tewari},
title = {Predtron: A Family of Online Algorithms for General Prediction Problems},
booktitle = {Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2015},
url = {all_papers/JNT15_NIPS.pdf}
}
Alternating Minimization for Regression Problems with Vector-valued Outputs
Prateek Jain and Ambuj Tewari,
in Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2015.
[BibTeX]
[Abstract]
[URL]
In regression problems involving vector-valued outputs (or equivalently, multiple responses), it is well known that the maximum likelihood estimator (MLE), which takes noise covariance structure into account, can be significantly more accurate than the ordinary least squares (OLS) estimator. However, existing literature compares OLS and MLE in terms of their asymptotic, not finite sample, guarantees. More crucially, computing the MLE in general requires solving a non-convex optimization problem and is not known to be efficiently solvable. We provide finite sample upper and lower bounds on the estimation error of OLS and MLE, in two popular models: a) Pooled model, b) Seemingly Unrelated Regression (SUR) model. We provide precise instances where the MLE is significantly more accurate than OLS. Furthermore, for both models, we show that the output of a computationally efficient alternating minimization procedure enjoys the same performance guarantee as MLE, up to universal constants. Finally, we show that for high-dimensional settings as well, the alternating minimization procedure leads to significantly more accurate solutions than the corresponding OLS solutions but with error bound that depends only logarithmically on the data dimensionality.
@inproceedings{JT15,
author = {Prateek Jain and Ambuj Tewari},
title = {Alternating Minimization for Regression Problems with Vector-valued Outputs},
booktitle = {Proceedings of the 28th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2015},
url = {all_papers/JT15_NIPS.pdf}
}
Surrogate Functions for Maximizing Precision at the Top
Purushottam Kar, Harikrishna Narasimhan and Prateek Jain,
in Proceedings of the 32nd International Conference on Machine Learning (ICML),
2015.
[BibTeX]
[Abstract]
[URL]
The problem of maximizing precision at top, also dubbed Precision@k, finds relevance in myriad learning applications such as ranking, multi-label classification, and learning with severe label imbalances. Despite its popularity, Precision@k is not known to have a surrogate function that upper bounds it. Similarly, notions of consistency under certain noise/margin conditions are also not explored.
In this work, we devise two novel convex surrogate functions for Precision@k, that upper bound it and are motivated by certain natural notions of margin for Precision@k performance measure. We also provide two novel perceptron algorithms for Precision@k that have interesting mistake bounds w.r.t. the proposed surrogates.
Moreover, we devise scalable stochastic gradient descent style methods for our proposed surrogates and prove convergence bounds for the same. Our convergence bounds rely on a strong uniform convergence bound for Precision@k and crucially exploit the structural simplicity of Precision@k. We conclude with experimental evidence of superiority of our surrogates when compared to the structural SVM surrogate \cite{Joachims05}, a state-of-the-art approach to optimize Precision@k.
@inproceedings{KarNJ15,
author = {Purushottam Kar and Harikrishna Narasimhan and Prateek Jain},
title = {Surrogate Functions for Maximizing Precision at the Top},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},
year = {2015},
pages = {189--198},
url = {all_papers/KNJ15_ICML.pdf}
}
Optimizing Non-decomposable Performance Measures: A Tale of Two Classes
Harikrishna Narasimhan, Purushottam Kar and Prateek Jain,
in Proceedings of the 32nd International Conference on Machine Learning (ICML),
2015.
[BibTeX]
[Abstract]
[URL]
Modern classification problems frequently present mild to severe label imbalance as well as specific requirements on classification characteristics, and require optimizing performance measures that are non-decomposable over the dataset, such as F-measure. Such measures have spurred much interest and pose specific challenges to learning algorithms since their non-additive nature precludes a direct application of well-studied large scale optimization methods such as stochastic gradient descent.
In this paper we reveal that for two large families of performance measures that can be expressed as functions of true positive/negative rates, it is indeed possible to implement point stochastic updates. The families we consider are concave and pseudo-linear functions of TPR, TNR which cover several popularly used performance measures such as F-measure, G-mean and H-mean.
Our core contribution is an adaptive linearization scheme for these families, using which we develop optimization techniques that enable truly point-based stochastic updates. For concave performance measures we propose a stochastic primal-dual solver; for pseudo-linear measures we propose a stochastic alternate maximization procedure. Both methods have crisp convergence guarantees, demonstrate significant speedups over existing methods - often by an order of magnitude or more, and give similar or more accurate predictions on test data.
@inproceedings{NarasimhanKJ15,
author = {Harikrishna Narasimhan and Purushottam Kar and Prateek Jain},
title = {Optimizing Non-decomposable Performance Measures: A Tale of Two Classes},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},
year = {2015},
pages = {199--208},
url = {all_papers/NKJ15_ICML.pdf}
}
Fast Exact Matrix Completion with Finite Samples
Prateek Jain and Praneeth Netrapalli,
in Proceedings of The 28th Conference on Learning Theory (COLT),
2015.
[BibTeX]
[Abstract]
[URL]
Matrix completion is the problem of recovering a low rank matrix by observing a small fraction of its entries. A series of recent works \citep{Keshavan2012,JainNS2013,Hardt2013} have proposed fast non-convex optimization based iterative algorithms to solve this problem. However, the sample complexity in all these results is sub-optimal in its dependence on the rank, condition number and the desired accuracy.
In this paper, we present a fast iterative algorithm that solves the matrix completion problem by observing $O(nr^5 \log^3 n)$ entries, which is independent of the condition number and the desired accuracy. The run time of our algorithm is $O(nr^7\log^3 n\log 1/\epsilon)$ which is near linear in the dimension of the matrix. To the best of our knowledge, this is the first near linear time algorithm for exact matrix completion with finite sample complexity (i.e. independent of $\epsilon$).
Our algorithm is based on a well known projected gradient descent method, where the projection is onto the (non-convex) set of low rank matrices. There are two key ideas in our result: 1) our argument is based on an $\ell_\infty$ norm potential function (as opposed to the spectral norm) and provides a novel way to obtain perturbation bounds for it. 2) we prove and use a natural extension of the Davis-Kahan theorem to obtain perturbation bounds on the best low rank approximation of matrices with good eigen gap. Both of these ideas may be of independent interest.
@inproceedings{JN15,
author = {Prateek Jain and Praneeth Netrapalli},
title = {Fast Exact Matrix Completion with Finite Samples},
booktitle = {Proceedings of The 28th Conference on Learning Theory (COLT)},
year = {2015},
pages = {1007--1034},
url = {all_papers/JN15_COLT.pdf}
}
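The abstract above builds on projected gradient descent with a projection onto rank-r matrices; a minimal NumPy sketch of that template follows (the step size, the scaling by the observed fraction, the iteration count, and the dense SVD are illustrative choices, and none of the paper's sharper analysis is reflected here).
    import numpy as np

    def rank_projected_gradient_completion(M_obs, mask, r, step=1.0, iters=200):
        """Projected gradient descent onto rank-r matrices (illustrative sketch).

        M_obs: (n, m) matrix with observed entries filled in (zeros elsewhere);
        mask: boolean (n, m) indicating observed entries; r: target rank.
        """
        p = mask.mean()                                    # observed fraction
        X = np.zeros_like(M_obs, dtype=float)
        for _ in range(iters):
            G = mask * (X - M_obs)                         # gradient on observed entries
            Y = X - (step / p) * G
            U, s, Vt = np.linalg.svd(Y, full_matrices=False)
            X = (U[:, :r] * s[:r]) @ Vt[:r]                # project to rank r
        return X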
Tighter Low-rank Approximation via Sampling the Leveraged Element
Srinadh Bhojanapalli, Prateek Jain and Sujay Sanghavi,
in Proceedings of the Twenty-Sixth Annual ACM-SIAM Symposium on Discrete Algorithms (SODA),
2015.
[BibTeX]
[Abstract]
[URL]
In this work, we propose a new randomized algorithm for computing a low-rank approximation to a given matrix. Taking an approach different from existing literature, our method first involves a specific biased sampling, with an element being chosen based on the leverage scores of its row and column, and then involves weighted alternating minimization over the factored form of the intended low-rank matrix, to minimize error only on these samples. Our method can leverage input sparsity, yet produce approximations in {\em spectral} (as opposed to the weaker Frobenius) norm; this combines the best aspects of otherwise disparate current results, but with a dependence on the condition number $\kappa = \sigma_1/\sigma_r$. In particular we require $O(nnz(M) + \frac{n\kappa^2 r^5}{\epsilon^2} )$ computations to generate a rank-$r$ approximation to $M$ in spectral norm. In contrast, the best existing method requires $O(nnz(M)+ \frac{nr^2}{\epsilon^4})$ time to compute an approximation in Frobenius norm. Besides the tightness in spectral norm, we have a better dependence on the error $\epsilon$. Our method is naturally and highly parallelizable.
Our new approach enables two extensions that are interesting on their own. The first is a new method to directly compute a low-rank approximation (in efficient factored form) to the product of two given matrices; it computes a small random set of entries of the product, and then executes weighted alternating minimization (as before) on these. The sampling strategy is different because now we cannot access leverage scores of the product matrix (but instead have to work with the input matrices). The second extension is an improved algorithm with smaller communication complexity for the distributed PCA setting (where each server has a small set of rows of the matrix, and the servers want to compute a low-rank approximation with a small amount of communication).
@inproceedings{BhojanapalliJS15,
author = {Srinadh Bhojanapalli and Prateek Jain and Sujay Sanghavi},
title = {Tighter Low-rank Approximation via Sampling the Leveraged Element},
booktitle = {Proceedings of the Twenty-Sixth Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},
year = {2015},
pages = {902--920},
url = {all_papers/BJS15_SODA.pdf},
doi = {https://doi.org/10.1137/1.9781611973730.62}
}
Online and Stochastic Gradient Methods for Non-decomposable Loss Functions
Purushottam Kar, Harikrishna Narasimhan and Prateek Jain,
in Proceedings of the 27th Annual Conference on Advances in Neural Information Processing Systems (NIPS),
2014.
[BibTeX]
[Abstract]
[URL]
Modern applications in sensitive domains such as biometrics and medicine frequently require the use of \emph{non-decomposable} loss functions such as precision$@k$, F-measure etc. Compared to point loss functions such as hinge-loss, these offer much more fine grained control over prediction, but at the same time present novel challenges in terms of algorithm design and analysis. In this work we initiate a study of online learning techniques for such non-decomposable loss functions with an aim to enable incremental learning as well as design scalable solvers for batch problems. To this end, we propose an online learning framework for such loss functions. Our model enjoys several nice properties, chief amongst them being the existence of efficient online learning algorithms with sublinear regret and online to batch conversion bounds. Our model is a provable extension of existing online learning models for point loss functions. We instantiate two popular losses, precision$@k$ and pAUC, in our model and prove sublinear regret bounds for both of them. Our proofs require a novel structural lemma over ranked lists which may be of independent interest. We then develop scalable stochastic gradient descent solvers for non-decomposable loss functions. We show that for a large family of loss functions satisfying a certain uniform convergence property (that includes precision$@k$, pAUC, and F-measure), our methods provably converge to the empirical risk minimizer. Such uniform convergence results were not known for these losses and we establish these using novel proof techniques. We then use extensive experimentation on real life and benchmark datasets to establish that our method can be orders of magnitude faster than a recently proposed cutting plane method.
@inproceedings{KarNJ14,
author = {Purushottam Kar and Harikrishna Narasimhan and Prateek Jain},
title = {Online and Stochastic Gradient Methods for Non-decomposable Loss Functions},
booktitle = {Proceedings of the 27th Annual Conference on Advances in Neural Information Processing Systems (NIPS)},
year = {2014},
pages = {694--702},
url = {all_papers/NKJ14_NIPS.pdf}
}
Non-convex Robust PCA
Praneeth Netrapalli, U. N. Niranjan, Sujay Sanghavi, Animashree Anandkumar and Prateek Jain,
in Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems (NIPS),
2014.
[BibTeX]
[Abstract]
[URL]
@inproceedings{NetrapalliNSAJ14,
author = {Praneeth Netrapalli and U. N. Niranjan and Sujay Sanghavi and Animashree Anandkumar and Prateek Jain},
title = {Non-convex Robust PCA},
booktitle = {Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2014},
pages = {1107--1115},
url = {http://papers.nips.cc/paper/5430-non-convex-robust-pca}
}
Universal Matrix Completion
Srinadh Bhojanapalli and Prateek Jain,
in Proceedings of the 31th International Conference on Machine Learning (ICML),
2014.
[BibTeX]
[Abstract]
[URL]
The problem of low-rank matrix completion has recently generated a lot of interest leading to several results that offer exact solutions to the problem. However, in order to do so, these methods make assumptions that can be quite restrictive in practice. More specifically, the methods assume that: a) the observed indices are sampled uniformly at random, and b) for every new matrix, the observed indices are sampled \emph{afresh}. In this work, we address these issues by providing a universal recovery guarantee for matrix completion that works for a variety of sampling schemes. In particular, we show that if the set of sampled indices come from the edges of a bipartite graph with large spectral gap (i.e. gap between the first and the second singular value), then the nuclear norm minimization based method exactly recovers all low-rank matrices that satisfy certain incoherence properties.
Moreover, we also show that under certain stricter incoherence conditions, $O(nr^2)$ uniformly sampled entries are enough to recover any rank-$r$ $n\times n$ matrix, in contrast to the $O(nr\log n)$ sample complexity required by other matrix completion algorithms as well as existing analyses of the nuclear norm method.
@inproceedings{BhojanapalliJ14,
author = {Srinadh Bhojanapalli and Prateek Jain},
title = {Universal Matrix Completion},
booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)},
year = {2014},
pages = {1881--1889},
url = {all_papers/BJ14_ICML.pdf}
}
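Setting the recovery analysis aside, the nuclear norm minimization estimator referred to above can be approximated with a standard proximal-gradient (singular value thresholding) loop on a regularized form of the problem. The sketch below is such a generic solver, offered only as an illustration: the Lagrangian objective, the step size, and the regularization weight are assumptions of this toy version, not choices taken from the paper.

import numpy as np

def svt(Y, tau):
    # singular value thresholding: proximal operator of tau * nuclear norm
    U, s, Vt = np.linalg.svd(Y, full_matrices=False)
    return (U * np.maximum(s - tau, 0.0)) @ Vt

def complete(M_obs, mask, lam=0.05, step=1.0, iters=300):
    # proximal gradient on 0.5 * ||P_Omega(X - M)||_F^2 + lam * ||X||_*
    X = np.zeros_like(M_obs)
    for _ in range(iters):
        grad = mask * (X - M_obs)            # gradient of the smooth data-fit term
        X = svt(X - step * grad, step * lam)
    return X

# toy example: rank-2 ground truth with roughly half of the entries observed
rng = np.random.default_rng(0)
n, r = 50, 2
M = rng.standard_normal((n, r)) @ rng.standard_normal((r, n))
mask = (rng.random((n, n)) < 0.5).astype(float)
X_hat = complete(M * mask, mask)
print("relative error:", np.linalg.norm(X_hat - M) / np.linalg.norm(M))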
(Near) Dimension Independent Risk Bounds for Differentially Private Learning
Prateek Jain and Abhradeep Guha Thakurta,
in Proceedings of the 31st International Conference on Machine Learning (ICML),
2014.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainT14,
author = {Prateek Jain and Abhradeep Guha Thakurta},
title = {(Near) Dimension Independent Risk Bounds for Differentially Private Learning},
booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)},
year = {2014},
pages = {476--484},
url = {all_papers/JT14_ICML.pdf}
}
Large-scale Multi-label Learning with Missing Labels
Hsiang-Fu Yu, Prateek Jain, Purushottam Kar and Inderjit S. Dhillon,
in Proceedings of the 31st International Conference on Machine Learning (ICML),
2014.
[BibTeX]
[Abstract]
[URL]
The multi-label classification problem has generated significant interest in recent years. However, existing approaches do not adequately address two key challenges: (a) scaling up to problems with a large number (say millions) of labels, and (b) handling data with missing labels. In this paper, we directly address both these problems by studying the multi-label problem in a generic empirical risk minimization (ERM) framework. Our framework, despite being simple, is surprisingly able to encompass several recent label-compression based methods which can be derived as special cases of our method. To optimize the ERM problem, we develop techniques that exploit the structure of specific loss functions, such as the squared loss function, to obtain efficient algorithms. We further show that our learning framework admits excess risk bounds even in the presence of missing labels. Our bounds are tight and demonstrate better generalization performance for low-rank promoting trace-norm regularization when compared to (rank insensitive) Frobenius norm regularization. Finally, we present extensive empirical results on a variety of benchmark datasets and show that our methods perform significantly better than existing label compression based methods and can scale up to very large datasets such as a Wikipedia dataset that has more than 200,000 labels.
@inproceedings{YuJKD14,
author = {Hsiang-Fu Yu and Prateek Jain and Purushottam Kar and Inderjit S. Dhillon},
title = {Large-scale Multi-label Learning with Missing Labels},
booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)},
year = {2014},
pages = {593--601},
url = {all_papers/YJKD14_ICML.pdf}
}
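The ERM view described above fits a label predictor using only the observed entries of the label matrix, with low-rank structure on the predictor. A minimal gradient-descent sketch with the squared loss and an explicit rank-$k$ factorization $W = UV^\top$ is given below; it only illustrates the objective, with placeholder learning-rate and regularization values, and is not the scalable solvers or the trace-norm formulation studied in the paper.

import numpy as np

def fit_low_rank_multilabel(X, Y, mask, k=5, lam=0.1, lr=0.01, iters=500):
    # gradient descent on the squared loss over observed labels only, with the
    # d x L predictor factorized as W = U @ V.T (rank k)
    n, d = X.shape
    L = Y.shape[1]
    rng = np.random.default_rng(0)
    U = 0.01 * rng.standard_normal((d, k))
    V = 0.01 * rng.standard_normal((L, k))
    for _ in range(iters):
        P = X @ U                            # n x k intermediate features
        R = mask * (P @ V.T - Y)             # residuals on observed (instance, label) pairs
        gU = X.T @ (R @ V) / n + lam * U
        gV = R.T @ P / n + lam * V
        U -= lr * gU
        V -= lr * gV
    return U, V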
Learning Sparsely Used Overcomplete Dictionaries
Alekh Agarwal, Animashree Anandkumar, Prateek Jain, Praneeth Netrapalli and Rashish Tandon,
in Proceedings of The 27th Conference on Learning Theory (COLT),
2014.
[BibTeX]
[Abstract]
[URL]
We consider the problem of learning sparsely used overcomplete dictionaries, where each observation is a sparse combination of elements from an unknown overcomplete dictionary. We establish exact recovery when the dictionary elements are mutually incoherent. Our method consists of a clustering-based initialization step, which provides an approximate estimate of the true dictionary with guaranteed accuracy. This estimate is then refined via an iterative algorithm with the following alternating steps: 1) estimation of the dictionary coefficients for each observation through $\ell_1$ minimization, given the dictionary estimate, and 2) estimation of the dictionary elements through least squares, given the coefficient estimates. We establish that, under a set of sufficient conditions, our method converges at a linear rate to the true dictionary as well as the true coefficients for each observation.
@inproceedings{AgarwalA0NT14,
author = {Alekh Agarwal and Animashree Anandkumar and Prateek Jain and Praneeth Netrapalli and Rashish Tandon},
title = {Learning Sparsely Used Overcomplete Dictionaries},
booktitle = {Proceedings of The 27th Conference on Learning Theory (COLT)},
year = {2014},
pages = {123--137},
url = {all_papers/AAJNT14_COLT.pdf}
}
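A toy version of the two alternating steps described above (an $\ell_1$ sparse-coding step with the dictionary fixed, then a least-squares dictionary update with the codes fixed) can be sketched as follows. This is only illustrative: it uses a random initialization rather than the clustering-based one the guarantees rely on, and plain ISTA as an off-the-shelf way to solve the $\ell_1$ step.

import numpy as np

def ista(A, y, lam, iters=200):
    # minimize 0.5 * ||A x - y||^2 + lam * ||x||_1 by iterative soft-thresholding
    step = 1.0 / np.linalg.norm(A, 2) ** 2        # 1 / Lipschitz constant of the gradient
    x = np.zeros(A.shape[1])
    for _ in range(iters):
        z = x - step * (A.T @ (A @ x - y))
        x = np.sign(z) * np.maximum(np.abs(z) - step * lam, 0.0)
    return x

def alt_min_dictionary(Y, k, lam=0.1, rounds=10, seed=0):
    # alternate sparse coding (dictionary fixed) with a least-squares dictionary
    # update (codes fixed); dictionary columns are re-normalized every round
    rng = np.random.default_rng(seed)
    d, n = Y.shape
    A = rng.standard_normal((d, k))
    A /= np.linalg.norm(A, axis=0)
    X = np.zeros((k, n))
    for _ in range(rounds):
        X = np.column_stack([ista(A, Y[:, i], lam) for i in range(n)])
        A = Y @ np.linalg.pinv(X)                 # least-squares fit of the dictionary
        A /= np.linalg.norm(A, axis=0) + 1e-12
    return A, X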
Learning Mixtures of Discrete Product Distributions using Spectral Decompositions
Prateek Jain and Sewoong Oh,
in Proceedings of The 27th Conference on Learning Theory (COLT),
2014.
[BibTeX]
[Abstract]
[URL]
We study the problem of learning a distribution from samples, when the underlying distribution is a mixture of product distributions over discrete domains. This problem is motivated by several practical applications such as crowdsourcing, recommendation systems, and learning Boolean functions. The existing solutions either heavily rely on the fact that the number of mixtures is finite or have sample/time complexity that is exponential in the number of mixtures. In this paper, we introduce a polynomial time/sample complexity method for learning a mixture of $r$ discrete product distributions over $\{1, 2, \dots, \ell\}^n$, for general $\ell$ and $r$. We show that our approach is consistent and further provide finite sample guarantees.
We use recently developed techniques from tensor decompositions for moment matching. A crucial step in these approaches is to construct certain tensors with low-rank spectral decompositions. These tensors are typically estimated from the sample moments. The main challenge in learning mixtures of discrete product distributions is that the corresponding low-rank tensors cannot be obtained directly from the sample moments. Instead, we need to estimate a low-rank matrix using only off-diagonal entries, and estimate a tensor using a few linear measurements. We give an alternating minimization based method to estimate the low-rank matrix, and formulate the tensor estimation problem as a least-squares problem.
@inproceedings{JainO14,
author = {Prateek Jain and Sewoong Oh},
title = {Learning Mixtures of Discrete Product Distributions using Spectral Decompositions},
booktitle = {Proceedings of The 27th Conference on Learning Theory (COLT)},
year = {2014},
pages = {824--856},
url = {all_papers/JO14_COLT.pdf}
}
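One sub-problem highlighted above is estimating a low-rank matrix from its off-diagonal entries alone. A generic masked alternating least-squares sketch for that sub-problem (treating the diagonal as unobserved) is given below; it is a plain ALS illustration, not the specific estimator or the guarantees from the paper.

import numpy as np

def als_offdiag(M, r, iters=50, seed=0):
    # fit a rank-r factorization U @ V.T to the off-diagonal entries of M,
    # updating one row of U (then of V) at a time by least squares
    n = M.shape[0]
    off = ~np.eye(n, dtype=bool)                  # mask of observed (off-diagonal) entries
    rng = np.random.default_rng(seed)
    U = rng.standard_normal((n, r))
    V = rng.standard_normal((n, r))
    for _ in range(iters):
        for i in range(n):
            U[i] = np.linalg.lstsq(V[off[i]], M[i, off[i]], rcond=None)[0]
        for j in range(n):
            V[j] = np.linalg.lstsq(U[off[:, j]], M[off[:, j], j], rcond=None)[0]
    return U, V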
Memory Limited, Streaming PCA
Ioannis Mitliagkas, Constantine Caramanis and Prateek Jain,
in Proceedings of the 27th Annual Conference on Neural Information Processing Systems (NIPS),
2013.
[BibTeX]
[Abstract]
[URL]
We consider streaming, one-pass principal component analysis (PCA), in the high-dimensional regime, with limited memory. Here, $p$-dimensional samples are presented sequentially, and the goal is to produce the $k$-dimensional subspace that best approximates these points. Standard algorithms require $O(p^2)$ memory; meanwhile, no algorithm can do better than $O(kp)$ memory, since this is what the output itself requires. Memory (or storage) complexity is most meaningful when understood in the context of computational and sample complexity. Sample complexity for high-dimensional PCA is typically studied in the setting of the {\em spiked covariance model}, where $p$-dimensional points are generated from a population covariance equal to the identity (white noise) plus a low-dimensional perturbation (the spike) which is the signal to be recovered. It is now well-understood that the spike can be recovered when the number of samples, $n$, scales proportionally with the dimension, $p$. Yet, all algorithms that provably achieve this have memory complexity $O(p^2)$. Meanwhile, algorithms with memory complexity $O(kp)$ do not have provable bounds on sample complexity comparable to $p$. We present an algorithm that achieves both: it uses $O(kp)$ memory (meaning storage of any kind) and is able to compute the $k$-dimensional spike with $O(p \log p)$ sample complexity -- the first algorithm of its kind. While our theoretical analysis focuses on the spiked covariance model, our simulations show that our algorithm is successful on much more general models for the data.
@inproceedings{MitliagkasC013,
author = {Ioannis Mitliagkas and Constantine Caramanis and Prateek Jain},
title = {Memory Limited, Streaming PCA},
booktitle = {Proceedings of the 27th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2013},
pages = {2886--2894},
url = {http://papers.nips.cc/paper/5035-memory-limited-streaming-pca}
}
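An $O(kp)$ memory footprint is what a block stochastic power method needs: keep only a $p \times k$ iterate, accumulate $\sum_i x_i (x_i^\top Q)$ over a block of samples, and orthonormalize at block boundaries. The sketch below is a minimal version of that idea on a toy spiked-covariance stream; it is illustrative only and does not reproduce the block sizes or the analysis behind the paper's guarantees.

import numpy as np

def streaming_block_pca(sample_stream, p, k, block_size):
    # one-pass block power method: only the p x k iterate Q and a p x k
    # accumulator are stored, so memory is O(k * p)
    rng = np.random.default_rng(0)
    Q, _ = np.linalg.qr(rng.standard_normal((p, k)))
    S = np.zeros((p, k))
    seen = 0
    for x in sample_stream:                 # each x is one length-p sample
        S += np.outer(x, x @ Q)             # rank-one update x (x^T Q)
        seen += 1
        if seen % block_size == 0:
            Q, _ = np.linalg.qr(S / block_size)
            S[:] = 0.0
    return Q                                # orthonormal estimate of the top-k subspace

# toy spiked-covariance stream: isotropic noise plus a planted direction u
p, k, n = 200, 1, 20000
rng = np.random.default_rng(1)
u = rng.standard_normal(p); u /= np.linalg.norm(u)
stream = (rng.standard_normal(p) + 3.0 * rng.standard_normal() * u for _ in range(n))
Q = streaming_block_pca(stream, p, k, block_size=2000)
print(abs(u @ Q[:, 0]))                     # close to 1 when the spike is recovered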
Phase Retrieval using Alternating Minimization
Praneeth Netrapalli, Prateek Jain and Sujay Sanghavi,
in Proceedings of the 27th Annual Conference on Neural Information Processing Systems (NIPS),
2013.
[BibTeX]
[Abstract]
[URL]
Phase retrieval problems involve solving linear equations, but with missing sign (or phase, for complex numbers) information. Over the last two decades, a popular generic empirical approach to the many variants of this problem has been one of alternating minimization; i.e., alternating between estimating the missing phase information and the candidate solution. In this paper, we show that a simple alternating minimization algorithm geometrically converges to the solution of one such problem -- finding a vector $\mathbf{x}$ from $\mathbf{y}, \mathbf{A}$, where $\mathbf{y} = |\mathbf{A}^T\mathbf{x}|$ and $|\mathbf{z}|$ denotes a vector of element-wise magnitudes of $\mathbf{z}$ -- under the assumption that $\mathbf{A}$ is Gaussian. Empirically, our algorithm performs similarly to recently proposed convex techniques for this variant (which are based on ``lifting'' to a convex matrix problem) in terms of sample complexity and robustness to noise. However, our algorithm is much more efficient and can scale to large problems. Analytically, we show geometric convergence to the solution, and sample complexity that is off by log factors from obvious lower bounds. We also establish close to optimal scaling for the case when the unknown vector is sparse. Our work represents the only known theoretical guarantee for alternating minimization for any variant of phase retrieval problems in the non-convex setting.
@inproceedings{NetrapalliJS13,
author = {Praneeth Netrapalli and Prateek Jain and Sujay Sanghavi},
title = {Phase Retrieval using Alternating Minimization},
booktitle = {Proceedings of the 27th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2013},
pages = {2796--2804},
url = {all_papers/NJS13_NIPS.pdf}
}
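A minimal real-valued version of the alternating scheme described above (guess the missing signs from the current estimate, then re-solve the resulting least-squares problem) is sketched below. The plain spectral initialization and the Gaussian toy data are assumptions of the sketch; the resampling-based initialization analysed in the paper and the complex case are not reproduced here.

import numpy as np

def alt_min_phase(A, y, iters=50):
    # alternating minimization for real phase retrieval, y = |A.T @ x|
    # initialization: top eigenvector of sum_i y_i^2 a_i a_i^T, rescaled
    S = (A * y ** 2) @ A.T
    _, V = np.linalg.eigh(S)
    x = V[:, -1] * np.sqrt(np.mean(y ** 2))
    for _ in range(iters):
        c = np.sign(A.T @ x)                              # step 1: sign (phase) estimate
        x, *_ = np.linalg.lstsq(A.T, c * y, rcond=None)   # step 2: least squares in x
    return x

# toy check: recovery is only possible up to a global sign flip
rng = np.random.default_rng(0)
d, m = 50, 500
A = rng.standard_normal((d, m))                           # columns are the measurement vectors
x_true = rng.standard_normal(d)
y = np.abs(A.T @ x_true)
x_hat = alt_min_phase(A, y)
err = min(np.linalg.norm(x_hat - x_true), np.linalg.norm(x_hat + x_true))
print("relative error:", err / np.linalg.norm(x_true))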
One-Bit Compressed Sensing: Provable Support and Vector Recovery
Sivakant Gopi, Praneeth Netrapalli, Prateek Jain and Aditya V. Nori,
in Proceedings of the 30th International Conference on Machine Learning (ICML),
2013.
[BibTeX]
[Abstract]
[URL]
@inproceedings{GopiN0N13,
author = {Sivakant Gopi and Praneeth Netrapalli and Prateek Jain and Aditya V. Nori},
title = {One-Bit Compressed Sensing: Provable Support and Vector Recovery},
booktitle = {Proceedings of the 30th International Conference on Machine Learning (ICML)},
year = {2013},
pages = {154--162},
url = {http://jmlr.org/proceedings/papers/v28/gopi13.html}
}
Differentially Private Learning with Kernels
Prateek Jain and Abhradeep Thakurta,
in Proceedings of the 30th International Conference on Machine Learning (ICML),
2013.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainT13,
author = {Prateek Jain and Abhradeep Thakurta},
title = {Differentially Private Learning with Kernels},
booktitle = {Proceedings of the 30th International Conference on Machine Learning (ICML)},
year = {2013},
pages = {118--126},
url = {all_papers/JT13_ICML.pdf}
}
On the Generalization Ability of Online Learning Algorithms for Pairwise Loss Functions
Purushottam Kar, Bharath K. Sriperumbudur, Prateek Jain and Harish Karnick,
in Proceedings of the 30th International Conference on Machine Learning (ICML),
2013.
[BibTeX]
[Abstract]
[URL]
In this paper, we study the generalization properties of online learning based stochastic methods for supervised learning problems where the loss function is dependent on more than one training sample (e.g., metric learning, ranking). We present a generic decoupling technique that enables us to provide Rademacher complexity-based generalization error bounds. Our bounds are in general tighter than those obtained by \citet{online-batch-pairwise} for the same problem. Using our decoupling technique, we are further able to obtain fast convergence rates for strongly convex pairwise loss functions. We are also able to analyze a class of memory efficient online learning algorithms for pairwise learning problems that use only a bounded subset of past training samples to update the hypothesis at each step. Finally, in order to complement our generalization bounds, we propose a novel memory efficient online learning algorithm for higher order learning problems with bounded regret guarantees.
@inproceedings{KarS0K13,
author = {Purushottam Kar and Bharath K. Sriperumbudur and Prateek Jain and Harish Karnick},
title = {On the Generalization Ability of Online Learning Algorithms for Pairwise Loss Functions},
booktitle = {Proceedings of the 30th International Conference on Machine Learning (ICML)},
year = {2013},
pages = {441--449},
url = {all_papers/SKJK13_ICML.pdf}
}
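The bounded-buffer algorithms mentioned in the abstract above keep only a small sample of past points and compare each incoming example against that buffer. The sketch below is one such scheme for a pairwise hinge (ranking) loss with a reservoir-sampled buffer; it is a generic illustration of the idea, not one of the specific algorithms analysed in the paper.

import numpy as np

def online_pairwise_hinge(stream, dim, buffer_size=32, lr=0.1, seed=0):
    # online gradient descent on a pairwise hinge loss, comparing each incoming
    # example only against a bounded, reservoir-sampled buffer of past examples;
    # labels are assumed to be +1 / -1
    rng = np.random.default_rng(seed)
    w = np.zeros(dim)
    buffer = []                                   # bounded list of (x, y) pairs
    for t, (x, y) in enumerate(stream, start=1):
        g = np.zeros(dim)
        for xb, yb in buffer:
            if yb == y:
                continue                          # only opposite-label pairs are ranked
            s = (y - yb) / 2                      # +1 if x is the positive of the pair
            if s * (w @ (x - xb)) < 1.0:          # hinge is active for this pair
                g -= s * (x - xb)
        if buffer:
            w -= lr / np.sqrt(t) * g / len(buffer)
        # reservoir sampling keeps the buffer a uniform sample of the past stream
        if len(buffer) < buffer_size:
            buffer.append((x, y))
        elif rng.integers(t) < buffer_size:
            buffer[rng.integers(buffer_size)] = (x, y)
    return w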
Low-rank matrix completion using alternating minimization
Prateek Jain, Praneeth Netrapalli and Sujay Sanghavi,
in Proceedings of the Symposium on Theory of Computing Conference (STOC),
2013.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainNS13,
author = {Prateek Jain and Praneeth Netrapalli and Sujay Sanghavi},
title = {Low-rank matrix completion using alternating minimization},
booktitle = {Proceedings of the Symposium on Theory of Computing Conference (STOC)},
year = {2013},
pages = {665--674},
url = {http://doi.acm.org/10.1145/2488608.2488693},
doi = {https://doi.org/10.1145/2488608.2488693}
}
Ad impression forecasting for sponsored search
Abhirup Nath, Shibnath Mukherjee, Prateek Jain, Navin Goyal and Srivatsan Laxman,
in Proceedings of the 22nd International World Wide Web Conference (WWW),
2013.
[BibTeX]
[Abstract]
[URL]
A typical problem for a search engine (hosting a sponsored search service) is to provide advertisers with a forecast of the number of impressions their ad is likely to obtain for a given bid. Accurate forecasts have high business value, since they enable advertisers to select bids that lead to better returns on their investment. They also play an important role in services such as automatic campaign optimization. Despite its importance, the problem has remained relatively unexplored in the literature. Existing methods typically overfit to the training data, leading to inconsistent performance. Furthermore, some of the existing methods cannot provide predictions for new ads, i.e., for ads that are not present in the logs. In this paper, we develop a generative model based approach that addresses these drawbacks. We design a Bayes net to capture inter-dependencies between the query traffic features and the competitors in an auction. Furthermore, we account for variability in the volume of query traffic by using a dynamic linear model. Finally, we implement our approach on a production-grade MapReduce framework and conduct extensive large scale experiments on substantial volumes of sponsored search data from Bing. Our experimental results demonstrate significant advantages over existing methods as measured using several accuracy/error criteria, improved ability to provide estimates for new ads, and more consistent performance with smaller variance in accuracies. Our method can also be adapted to several other related forecasting problems such as predicting the average position of ads or the number of clicks under budget constraints.
@inproceedings{NathMJGL13,
author = {Abhirup Nath and Shibnath Mukherjee and Prateek Jain and Navin Goyal and Srivatsan Laxman},
title = {Ad impression forecasting for sponsored search},
booktitle = {Proceedings of the 22nd International World Wide Web Conference (WWW)},
year = {2013},
pages = {943--952},
url = {all_papers/NMJGL13_WWW.pdf}
}
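The abstract mentions a dynamic linear model for the variability in query-traffic volume. For reference only, the simplest member of that family, a local-level model filtered with a Kalman recursion, can be sketched as below; the variance parameters q and r are placeholder assumptions, and the paper's actual model (and its Bayes-net component) is considerably richer.

import numpy as np

def local_level_forecast(y, q=1.0, r=10.0):
    # Kalman filter for the local-level dynamic linear model
    #   level_t = level_{t-1} + w_t,   w_t ~ N(0, q)
    #   y_t     = level_t + v_t,       v_t ~ N(0, r)
    # returns the filtered level, which is also the one-step-ahead forecast
    level, var = float(y[0]), r
    for obs in y[1:]:
        var += q                         # predict: the level follows a random walk
        gain = var / (var + r)           # update with the new observation
        level += gain * (obs - level)
        var *= 1.0 - gain
    return level

# toy usage on a slowly drifting daily impression count
counts = np.array([120, 131, 125, 140, 138, 150, 149, 161])
print(local_level_forecast(counts))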
Multilabel Classification using Bayesian Compressed Sensing
Ashish Kapoor, Raajay Viswanathan and Prateek Jain,
in Proceedings of the 26th Annual Conference on Neural Information Processing Systems (NIPS),
2012.
[BibTeX]
[Abstract]
[URL]
@inproceedings{KapoorVJ12,
author = {Ashish Kapoor and Raajay Viswanathan and Prateek Jain},
title = {Multilabel Classification using Bayesian Compressed Sensing},
booktitle = {Proceedings of the 26th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2012},
pages = {2654--2662},
url = {http://books.nips.cc/papers/files/nips25/NIPS2012_1243.pdf}
}
Supervised Learning with Similarity Functions
Purushottam Kar and Prateek Jain,
in Proceedings of the 26th Annual Conference on Neural Information Processing Systems (NIPS),
2012.
[BibTeX]
[Abstract]
[URL]
@inproceedings{KarJ12,
author = {Purushottam Kar and Prateek Jain},
title = {Supervised Learning with Similarity Functions},
booktitle = {Proceedings of the 26th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2012},
pages = {215--223},
url = {http://books.nips.cc/papers/files/nips25/NIPS2012_0123.pdf}
}
Improved multiple sequence alignments using coupled pattern mining
K. S. M. Tozammel Hossain, Debprakash Patnaik, Srivatsan Laxman, Prateek Jain, Chris Bailey-Kellogg and Naren Ramakrishnan,
in Proceedings of the ACM International Conference on Bioinformatics, Computational Biology and Biomedicine (BCB),
2012.
[BibTeX]
[Abstract]
[URL]
@inproceedings{HossainPLJBR12,
author = {K. S. M. Tozammel Hossain and Debprakash Patnaik and Srivatsan Laxman and Prateek Jain and Chris Bailey-Kellogg and Naren Ramakrishnan},
title = {Improved multiple sequence alignments using coupled pattern mining},
booktitle = {Proceedings of the ACM International Conference on Bioinformatics, Computational Biology and Biomedicine (BCB)},
year = {2012},
pages = {28--35},
url = {http://doi.acm.org/10.1145/2382936.2382940},
doi = {https://doi.org/10.1145/2382936.2382940}
}
Mirror Descent Based Database Privacy
Prateek Jain and Abhradeep Thakurta,
in Proceedings of the 16th International Workshop on Randomization and Computation (RANDOM),
2012.
[BibTeX]
[Abstract]
[URL]
In this paper, we focus on the problem of private database release in the interactive setting: a trusted database curator receives queries in an online manner for which it needs to respond with accurate but privacy preserving answers. To this end, we generalize the IDC (Iterative Database Construction) framework of [HR10, GRU11] that maintains a differentially private artificial dataset and answers incoming linear queries using the artificial dataset. In particular, we formulate a generic IDC framework based on the Mirror Descent algorithm, a popular convex optimization algorithm. We then present two concrete applications, namely, cut queries over a bipartite graph and linear queries over low-rank matrices, and provide significantly tighter error bounds than the ones by [HR10, GRU11].
@inproceedings{JainT12,
author = {Prateek Jain and Abhradeep Thakurta},
title = {Mirror Descent Based Database Privacy},
booktitle = {Proceedings of the 16th International Workshop on Randomization and Computation (RANDOM)},
year = {2012},
pages = {579--590},
url = {http://dx.doi.org/10.1007/978-3-642-32512-0_49},
doi = {https://doi.org/10.1007/978-3-642-32512-0_49}
}
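To make the IDC idea above concrete: with the entropic mirror map, the mirror-descent update of the synthetic dataset reduces to a multiplicative-weights step on a histogram over the data universe. The sketch below is a deliberately simplified instance for linear (counting) queries, in which the noise scale, the update rule, and the step size are placeholder assumptions rather than the calibration or the privacy accounting from the paper.

import numpy as np

def mirror_descent_idc(true_hist, queries, noise_scale=0.01, eta=0.5, seed=0):
    # keep a synthetic distribution over the data universe, answer each linear
    # query from it, and apply an entropic mirror-descent (multiplicative-weights)
    # update towards a noisy measurement of the true answer
    rng = np.random.default_rng(seed)
    p_true = true_hist / true_hist.sum()
    synth = np.ones_like(p_true) / len(p_true)    # start from the uniform histogram
    answers = []
    for q in queries:                             # each q is a 0/1 vector over the universe
        answers.append(q @ synth)                 # answer using the synthetic data only
        noisy = q @ p_true + rng.laplace(scale=noise_scale)
        synth = synth * np.exp(eta * (noisy - q @ synth) * q)
        synth /= synth.sum()
    return answers, synth

# toy usage: a universe of 8 items and two counting queries
hist = np.array([5.0, 1.0, 0.0, 3.0, 2.0, 0.0, 4.0, 1.0])
qs = [np.array([1, 1, 0, 0, 0, 0, 0, 0], float), np.array([0, 0, 0, 1, 1, 1, 0, 0], float)]
print(mirror_descent_idc(hist, qs)[0])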
Differentially Private Online Learning
Prateek Jain, Pravesh Kothari and Abhradeep Thakurta,
in Proceedings of the 25th Annual Conference on Learning Theory (COLT),
2012.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainKT12,
author = {Prateek Jain and Pravesh Kothari and Abhradeep Thakurta},
title = {Differentially Private Online Learning},
booktitle = {Proceedings of the 25th Annual Conference on Learning Theory (COLT)},
year = {2012},
pages = {24.1--24.34},
url = {http://www.jmlr.org/proceedings/papers/v23/jain12/jain12.pdf}
}
Orthogonal Matching Pursuit with Replacement
Prateek Jain, Ambuj Tewari and Inderjit S. Dhillon,
in Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NIPS),
2011.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainTD11,
author = {Prateek Jain and Ambuj Tewari and Inderjit S. Dhillon},
title = {Orthogonal Matching Pursuit with Replacement},
booktitle = {Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2011},
pages = {1215--1223},
url = {http://books.nips.cc/papers/files/nips24/NIPS2011_0707.pdf}
}
Similarity-based Learning via Data Driven Embeddings
Purushottam Kar and Prateek Jain,
in Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NIPS),
2011.
[BibTeX]
[Abstract]
[URL]
@inproceedings{KarJ11,
author = {Purushottam Kar and Prateek Jain},
title = {Similarity-based Learning via Data Driven Embeddings},
booktitle = {Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2011},
pages = {1998--2006},
url = {http://books.nips.cc/papers/files/nips24/NIPS2011_1129.pdf}
}
Hashing Hyperplane Queries to Near Points with Applications to Large-Scale Active Learning
Prateek Jain, Sudheendra Vijayanarasimhan and Kristen Grauman,
in Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS),
2010.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainVG10,
author = {Prateek Jain and Sudheendra Vijayanarasimhan and Kristen Grauman},
title = {Hashing Hyperplane Queries to Near Points with Applications to Large-Scale Active Learning},
booktitle = {Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2010},
pages = {928--936},
url = {http://books.nips.cc/papers/files/nips23/NIPS2010_0757.pdf}
}
Inductive Regularized Learning of Kernel Functions
Prateek Jain, Brian Kulis and Inderjit S. Dhillon,
in Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS),
2010.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainKD10,
author = {Prateek Jain and Brian Kulis and Inderjit S. Dhillon},
title = {Inductive Regularized Learning of Kernel Functions},
booktitle = {Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2010},
pages = {946--954},
url = {http://books.nips.cc/papers/files/nips23/NIPS2010_0603.pdf}
}
Guaranteed Rank Minimization via Singular Value Projection
Prateek Jain, Raghu Meka and Inderjit S. Dhillon,
in Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS),
2010.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainMD10,
author = {Prateek Jain and Raghu Meka and Inderjit S. Dhillon},
title = {Guaranteed Rank Minimization via Singular Value Projection},
booktitle = {Proceedings of the 24th Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2010},
pages = {937--945},
url = {http://books.nips.cc/papers/files/nips23/NIPS2010_0682.pdf}
}
Far-sighted active learning on a budget for image and video recognition
Sudheendra Vijayanarasimhan, Prateek Jain and Kristen Grauman,
in Proceedings of the Twenty-Third IEEE Conference on Computer Vision and Pattern Recognition (CVPR),
2010.
[BibTeX]
[Abstract]
[URL]
@inproceedings{VijayanarasimhanJG10,
author = {Sudheendra Vijayanarasimhan and Prateek Jain and Kristen Grauman},
title = {Far-sighted active learning on a budget for image and video recognition},
booktitle = {Proceedings of the Twenty-Third IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2010},
pages = {3035--3042},
url = {http://dx.doi.org/10.1109/CVPR.2010.5540055},
doi = {https://doi.org/10.1109/CVPR.2010.5540055}
}
Matrix Completion from Power-Law Distributed Samples
Raghu Meka, Prateek Jain and Inderjit S. Dhillon,
in Proceedings of the 23rd Annual Conference on Neural Information Processing Systems (NIPS),
2009.
[BibTeX]
[Abstract]
[URL]
@inproceedings{MekaJD09,
author = {Raghu Meka and Prateek Jain and Inderjit S. Dhillon},
title = {Matrix Completion from Power-Law Distributed Samples},
booktitle = {Proceedings of the 23rd Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2009},
pages = {1258--1266},
url = {http://books.nips.cc/papers/files/nips22/NIPS2009_0864.pdf}
}
Geometry-aware metric learning
Zhengdong Lu, Prateek Jain and Inderjit S. Dhillon,
in Proceedings of the 26th Annual International Conference on Machine Learning (ICML),
2009.
[BibTeX]
[Abstract]
[URL]
@inproceedings{LuJD09,
author = {Zhengdong Lu and Prateek Jain and Inderjit S. Dhillon},
title = {Geometry-aware metric learning},
booktitle = {Proceedings of the 26th Annual International Conference on Machine Learning (ICML)},
year = {2009},
pages = {673--680},
url = {http://doi.acm.org/10.1145/1553374.1553461},
doi = {https://doi.org/10.1145/1553374.1553461}
}
Active learning for large multi-class problems
Prateek Jain and Ashish Kapoor,
in Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR),
2009.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainK09,
author = {Prateek Jain and Ashish Kapoor},
title = {Active learning for large multi-class problems},
booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2009},
pages = {762--769},
url = {http://dx.doi.org/10.1109/CVPRW.2009.5206651},
doi = {https://doi.org/10.1109/CVPRW.2009.5206651}
}
Online Metric Learning and Fast Similarity Search
Prateek Jain, Brian Kulis, Inderjit S. Dhillon and Kristen Grauman,
in Proceedings of the Twenty-Second Annual Conference on Neural Information Processing Systems (NIPS),
2008.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainKDG08,
author = {Prateek Jain and Brian Kulis and Inderjit S. Dhillon and Kristen Grauman},
title = {Online Metric Learning and Fast Similarity Search},
booktitle = {Proceedings of the Twenty-Second Annual Conference on Neural Information Processing Systems (NIPS)},
year = {2008},
pages = {761--768},
url = {http://books.nips.cc/papers/files/nips21/NIPS2008_1003.pdf}
}
Rank minimization via online learning
Raghu Meka, Prateek Jain, Constantine Caramanis and Inderjit S. Dhillon,
in Proceedings of the Twenty-Fifth International Conference on Machine Learning (ICML),
2008.
[BibTeX]
[Abstract]
[URL]
@inproceedings{MekaJCD08,
author = {Raghu Meka and Prateek Jain and Constantine Caramanis and Inderjit S. Dhillon},
title = {Rank minimization via online learning},
booktitle = {Proceedings of the Twenty-Fifth International Conference on Machine Learning (ICML)},
year = {2008},
pages = {656--663},
url = {http://doi.acm.org/10.1145/1390156.1390239},
doi = {https://doi.org/10.1145/1390156.1390239}
}
Fast image search for learned metrics
Prateek Jain, Brian Kulis and Kristen Grauman,
in Proceedings of IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR),
2008.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainKG08,
author = {Prateek Jain and Brian Kulis and Kristen Grauman},
title = {Fast image search for learned metrics},
booktitle = {Proceedings of IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2008},
url = {http://dx.doi.org/10.1109/CVPR.2008.4587841},
doi = {https://doi.org/10.1109/CVPR.2008.4587841}
}
Simultaneous Unsupervised Learning of Disparate Clusterings
Prateek Jain, Raghu Meka and Inderjit S. Dhillon,
in Proceedings of the SIAM International Conference on Data Mining (SDM),
2008.
[BibTeX]
[Abstract]
[URL]
@inproceedings{JainMD08,
author = {Prateek Jain and Raghu Meka and Inderjit S. Dhillon},
title = {Simultaneous Unsupervised Learning of Disparate Clusterings},
booktitle = {Proceedings of the SIAM International Conference on Data Mining (SDM)},
year = {2008},
pages = {858--869},
url = {http://dx.doi.org/10.1137/1.9781611972788.77},
doi = {https://doi.org/10.1137/1.9781611972788.77}
}
Information-theoretic metric learning
Jason V. Davis, Brian Kulis, Prateek Jain, Suvrit Sra and Inderjit S. Dhillon,
in Proceedings of the Twenty-Fourth International Conference on Machine Learning (ICML),
2007.
[BibTeX]
[Abstract]
[URL]
@inproceedings{DavisKJSD07,
author = {Jason V. Davis and Brian Kulis and Prateek Jain and Suvrit Sra and Inderjit S. Dhillon},
title = {Information-theoretic metric learning},
booktitle = {Proceedings of the Twenty-Fourth International Conference on Machine Learning (ICML)},
year = {2007},
pages = {209--216},
url = {http://doi.acm.org/10.1145/1273496.1273523},
doi = {https://doi.org/10.1145/1273496.1273523}
}