Clustering Algorithms

From NLPWiki

Jump to: navigation, search

Final Project (Projects #5 and #6)

This list is meant to give you ideas for clustering algorithms you could implement for Projects #5 and #6. You may also consider papers or ideas that are not on this list.

Remember to run your idea past Dr. Ringger before you submit your one-page paper proposal.

Have fun!

The List

  • Steinbach et al.: Bisecting K-Means
@techreport{skk-200,
  author      = {Steinbach, Michael and Karypis, George and Kumar, Vipin},
  title       = {A Comparison of Document Clustering Techniques},
  institution = {Department of Computer Science and Engineering,
                 University of Minnesota},
  number      = {00-034},
  year        = {2000},
  month       = may,
  url         = {http://www.cs.umn.edu/tech_reports_upload/tr2000/00-034.pdf},
  annote      = {Compares KMeans and a variation, called bisecting KMeans to
                 agglomerative clustering.  Separates clustering metrics into
                 two categories: Internal and External, which are computed
                 without and with gold-standard classifications,
                 respectively.  External metrics used include the following:
                 entropy, F-measure.  Internal metric used: overall
                 similarity.},
}
  • Steinbach et al.: vector-space HAC
@techreport{skk-200,
  author      = {Steinbach, Michael and Karypis, George and Kumar, Vipin},
  title       = {A Comparison of Document Clustering Techniques},
  institution = {Department of Computer Science and Engineering,
                 University of Minnesota},
  number      = {00-034},
  year        = {2000},
  month       = may,
  url         = {http://www.cs.umn.edu/tech_reports_upload/tr2000/00-034.pdf},
  annote      = {Compares KMeans and a variation, called bisecting KMeans to
                 agglomerative clustering.  Separates clustering metrics into
                 two categories: Internal and External, which are computed
                 without and with gold-standard classifications,
                 respectively.  External metrics used include the following:
                 entropy, F-measure.  Internal metric used: overall
                 similarity.},
}
  • Meila and Heckerman: probabilistic HAC
    • see course schedule for link to paper


  • Derivative idea from Baker & McCallum: EM document clustering iterated with distributional word clustering. Goal: fast clustering in small feature space!
    • see course schedule for link to paper


  • Combination of Blei et al. and Griffiths: LDA using Gibbs Sampling
    • see course schedule for link to paper


  • Spectral clustering
    • no specific reference -- suggest one.


  • Basu et al.: Constrained clustering
    • see course schedule for link to paper


  • Using WordNet to extend clustering
@inproceedings{hss-2003,
  author    = {Hotho, Andreas and Staab, Steffen and Stumme, Gerd},
  title     = {{Wordnet} Improves Text Document Clustering},
  booktitle = {Proceedings of the Semantic Web Workshop at {SIGIR}-2003},
  year      = {2003},
}
  • Related to LDA
@inproceedings{sm-2006,
  author    = {Shafiei, M. Mahdi and Milios, Evangelos E.},
  title     = {Latent {Dirichlet} Co-Clustering},
  booktitle = {{ICDM} '06: Proceedings of the Sixth International Conference on Data Mining},
  year      = {2006},
  pages     = {542--551},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  isbn      = {0-7695-2701-9},
  doi       = {10.1109/ICDM.2006.94},
}
  • Restrictive clustering
@inproceedings{ss-04,
  author    = {Siersdorfer, Stefan and Sizov, Sergej},
  title     = {Restrictive clustering and metaclustering for self-organizing document collections},
  booktitle = {{SIGIR} '04: Proceedings of the 27th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  year      = {2004},
  pages     = {226--233},
  venue     = {Sheffield, United Kingdom},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  isbn      = {1-58113-881-4},
  doi       = {10.1145/1008992.1009032},
}
  • Evolutionary clustering
@inproceedings{ckt-2006,
  author    = {Chakrabarti, Deepayan and Kumar, Ravi and Tomkins, Andrew},
  title     = {Evolutionary Clustering},
  booktitle = {Proceedings of the 12th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  year      = {2006},
}
  • Novelty detection
@inproceedings{zgy,
  author    = {Zhang, Jian and Ghahramani, Zoubin and Yang, Yiming},
  title     = {A probabilistic model for online document clustering with application to novelty detection},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1617--1624},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2005},
}
  • Adaptive sub-space iteration
@inproceedings{lmo-04,
  author    = {Li, Tao and Ma, Sheng and Ogihara, Mitsunori},
  title     = {Document clustering via adaptive subspace iteration},
  booktitle = {{SIGIR} '04: Proceedings of the 27th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  year      = {2004},
  pages     = {218--225},
  venue     = {Sheffield, United Kingdom},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  isbn      = {1-58113-881-4},
  doi       = {10.1145/1008992.1009031},
}
  • Concept factorization
@inproceedings{xg-04,
  author    = {Xu, Wei and Gong, Yihong},
  title     = {Document clustering by concept factorization},
  booktitle = {{SIGIR} '04: Proceedings of the 27th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  year      = {2004},
  pages     = {202--209},
  venue     = {Sheffield, United Kingdom},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  isbn      = {1-58113-881-4},
  doi       = {10.1145/1008992.1009029},
}
Personal tools