% Bibliography database: dimension reduction, document representation,
% text categorization and document clustering references.
%
% Conventions used in this file:
%   - one field per line, brace-delimited values, lowercase entry types and field names;
%   - page ranges use a double hyphen (en-dash);
%   - months use the standard three-letter macros (jan ... dec), unquoted;
%   - words in titles that must keep their case are wrapped in braces;
%   - citation keys are kept exactly as cited from the documents.
%
% Comments are only safe outside entries; do not use an at-sign in comment text.

% Shared journal abbreviation (used by JainMF99 and Sebastiani02).
@string{ACMCompSurv = {ACM Comput. Surv.}}

@techreport{Fodor02,
  author      = {Imola K. Fodor},
  title       = {A survey of dimension reduction techniques},
  institution = {Center for Applied Scientific Computing, Lawrence Livermore National Laboratory},
  type        = {Technical Report},
  number      = {UCRL-ID-148494},
  year        = {2002},
  month       = jun,
}

@incollection{Husbands2001,
  author    = {Parry Husbands and Horst Simon and Chris H. Q. Ding},
  title     = {On the use of the singular value decomposition for text retrieval},
  booktitle = {Computational information retrieval},
  publisher = {Society for Industrial and Applied Mathematics},
  address   = {Philadelphia, PA, USA},
  year      = {2001},
  pages     = {145--156},
  isbn      = {0-89871-500-8},
}

% The report number is an identifier, not a range: plain hyphen, no en-dash.
@techreport{zhao01criterion,
  author      = {Y. Zhao and G. Karypis},
  title       = {Criterion functions for document clustering: Experiments and analysis},
  institution = {Department of Computer Science, University of Minnesota},
  type        = {Technical Report},
  number      = {TR \#01-40},
  address     = {Minneapolis, MN},
  year        = {2001},
}

@inproceedings{Milios_Zhang,
  author    = {E. Milios and Y. Zhang and N. Zincir-Heywood},
  title     = {Term-based clustering and summarization of web page collections},
  booktitle = {the Seventeenth Conference of the Canadian Society for Computational Studies of Intelligence (AI'04)},
  address   = {London, ON},
  year      = {2004},
  month     = may,
  pages     = {60--74},
}

@inproceedings{Cavnar94,
  author    = {William B. Cavnar},
  title     = {Using An N-Gram-Based Document Representation With A Vector Processing Retrieval Model},
  booktitle = {TREC},
  year      = {1994},
  pages     = {269--278},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% The URL lives in howpublished so that standard styles actually print it.
@misc{Vlado_NGram,
  author       = {Vlado Keselj},
  title        = {{Perl} package {Text::Ngrams}},
  howpublished = {http://users.cs.dal.ca/$\sim$vlado/srcperl/Ngrams/},
  year         = {2004},
}

@techreport{Bin2004,
  author      = {B. Tang and X. Luo and M. I. Heywood and M. Shepherd},
  title       = {Comparative Study of Dimension Reduction Techniques for Document Clustering},
  institution = {Faculty of Computer Science, Dalhousie University},
  type        = {Technical Report},
  number      = {CS-2004-14},
  year        = {2004},
  month       = dec,
}

@article{JainMF99,
  author  = {Anil K. Jain and M. Narasimha Murty and Patrick J. Flynn},
  title   = {Data Clustering: A Review},
  journal = ACMCompSurv,
  volume  = {31},
  number  = {3},
  year    = {1999},
  pages   = {264--323},
}

@techreport{berkhin02survey,
  author      = {Pavel Berkhin},
  title       = {Survey Of Clustering Data Mining Techniques},
  institution = {Accrue Software},
  address     = {San Jose, CA},
  year        = {2002},
}

@inproceedings{Beyer99,
  author    = {K. Beyer and J. Goldstein and R. Ramakrishnan and U. Shaft},
  title     = {When is the Nearest Neighbour Meaningful?},
  booktitle = {Proceedings of the 7th International Conference on Database Theory},
  year      = {1999},
  pages     = {217--235},
}

@article{blum97,
  author  = {Avrim Blum and Pat Langley},
  title   = {Selection of Relevant Features and Examples in Machine Learning},
  journal = {Artificial Intelligence},
  volume  = {97},
  number  = {1--2},
  year    = {1997},
  pages   = {245--271},
}

% NOTE(review): Pars-etal04b has no volume, number or pages; add them once
% confirmed against the published issue.
@article{Pars-etal04b,
  author  = {Lance Parsons and Ehtesham Haque and Huan Liu},
  title   = {Subspace Clustering for High Dimensional Data: A Review},
  journal = {SIGKDD Explorations, Newsletter of the ACM Special Interest Group on Knowledge Discovery and Data Mining},
  year    = {2004},
}

@inproceedings{yang97,
  author    = {Yiming Yang and Jan O. Pedersen},
  title     = {A comparative study on feature selection in text categorization},
  booktitle = {Proceedings of {ICML}-97, 14th International Conference on Machine Learning},
  editor    = {Douglas H. Fisher},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  address   = {Nashville, US},
  year      = {1997},
  pages     = {412--420},
}

@incollection{Kolenda99,
  author    = {T. Kolenda and L.K. Hansen and S. Sigurdsson},
  title     = {Independent Components in Text},
  booktitle = {Advances in Independent Component Analysis},
  editor    = {M. Girolami},
  publisher = {Springer-Verlag},
  year      = {2000},
  pages     = {229--250},
}

@article{Salton88,
  author    = {Gerard Salton and Christopher Buckley},
  title     = {Term-weighting approaches in automatic text retrieval},
  journal   = {Inf. Process. Manage.},
  volume    = {24},
  number    = {5},
  year      = {1988},
  pages     = {513--523},
  issn      = {0306-4573},
  publisher = {Pergamon Press, Inc.},
  address   = {Tarrytown, NY, USA},
}

@article{deerwester90,
  author  = {Scott C. Deerwester and Susan T. Dumais and Thomas K. Landauer and George W. Furnas and Richard A. Harshman},
  title   = {Indexing by Latent Semantic Analysis},
  journal = {Journal of the American Society for Information Science},
  volume  = {41},
  number  = {6},
  year    = {1990},
  pages   = {391--407},
}

@inproceedings{Jung01,
  author    = {Andreas Jung},
  title     = {An introduction to a new data analysis tool: Independent Component Analysis},
  booktitle = {Proceedings of Workshop GK ``Nonlinearity''},
  address   = {Regensburg},
  year      = {2001},
  month     = oct,
}

@inproceedings{Milios2003-2,
  author    = {E. Milios and Y. Zhang and B. He and L. Dong},
  title     = {Automatic Term Extraction and Document Similarity in Special Text Corpora},
  booktitle = {Proceedings of the 6th Conference of the Pacific Association for Computational Linguistics (PACLing'03)},
  address   = {Halifax, Nova Scotia, Canada},
  year      = {2003},
  month     = aug # " 22--25",
  pages     = {275--284},
}

@article{Sebastiani02,
  author    = {Fabrizio Sebastiani},
  title     = {Machine learning in automated text categorization},
  journal   = ACMCompSurv,
  volume    = {34},
  number    = {1},
  year      = {2002},
  pages     = {1--47},
  issn      = {0360-0300},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
}

@inproceedings{fuhr91,
  author    = {Norbert Fuhr and Stephan Hartmann and Gerhard Knorz and Gerhard Lustig and Michael Schwantner and Konstadinos Tzeras},
  title     = {{AIR/X} -- a Rule-Based Multistage Indexing System for Large Subject Fields},
  booktitle = {Proceedings of {RIAO}-91, 3rd International Conference ``Recherche d'Information Assist{\'e}e par Ordinateur''},
  editor    = {Andr{\'e} Lichnerowicz},
  publisher = {Elsevier Science Publishers, Amsterdam, NL},
  address   = {Barcelona, ES},
  year      = {1991},
  pages     = {606--623},
}

@inproceedings{schutze95,
  author    = {Hinrich Sch{\"u}tze and David A. Hull and Jan O. Pedersen},
  title     = {A Comparison of Classifiers and Document Representations for the Routing Problem},
  booktitle = {Research and Development in Information Retrieval},
  year      = {1995},
  pages     = {229--237},
}

@inproceedings{Tzeras93,
  author    = {Kostas Tzeras and Stephan Hartmann},
  title     = {Automatic indexing based on {Bayesian} inference networks},
  booktitle = {SIGIR '93: Proceedings of the 16th annual international ACM SIGIR conference on Research and development in information retrieval},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  location  = {Pittsburgh, Pennsylvania, United States},
  year      = {1993},
  pages     = {22--35},
  isbn      = {0-89791-605-0},
}

@inproceedings{Dhillon04,
  author    = {Arindam Banerjee and Inderjit Dhillon and Joydeep Ghosh and Srujana Merugu and Dharmendra S. Modha},
  title     = {A generalized maximum entropy approach to {Bregman} co-clustering and matrix approximation},
  booktitle = {KDD '04: Proceedings of the 2004 ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  location  = {Seattle, WA, USA},
  year      = {2004},
  pages     = {509--514},
  isbn      = {1-58113-888-9},
}

@unpublished{Lerman1999,
  author = {Kristina Lerman},
  title  = {Document Clustering in Reduced Dimension Vector Space},
  note   = {http://www.isi.edu/$\sim$lerman/papers/papers.html},
  year   = {1999},
}

@article{Yeung2001,
  author  = {K. Yeung and W. Ruzzo},
  title   = {Principal component analysis for clustering gene expression data},
  journal = {Bioinformatics},
  volume  = {17},
  number  = {9},
  year    = {2001},
  month   = sep,
  pages   = {763--774},
}

@inproceedings{Steinbach2000,
  author    = {Michael Steinbach and George Karypis and Vipin Kumar},
  title     = {A Comparison of Common Document Clustering Techniques},
  booktitle = {KDD Workshop on Text Mining},
  year      = {2000},
}

@inproceedings{Beil2002,
  author    = {F. Beil and M. Ester and X. Xu},
  title     = {Frequent term-based text clustering},
  booktitle = {Proc. 8th Int. Conf. on Knowledge Discovery and Data Mining (KDD)},
  address   = {Edmonton, Alberta},
  year      = {2002},
}

@inproceedings{Slonim2000,
  author    = {N. Slonim and N. Tishby},
  title     = {Document Clustering Using Word Clusters via the Information Bottleneck Method},
  booktitle = {23rd Annual International ACM SIGIR Conference},
  year      = {2000},
}

@inproceedings{Dhillon2003,
  author    = {I. Dhillon and S. Mallela and D. Modha},
  title     = {Information Theoretic co-clustering},
  booktitle = {Proceedings of The Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  address   = {Washington, DC},
  year      = {2003},
  month     = aug,
  pages     = {89--98},
}

% The download URL is kept in note rather than mixed into address.
@techreport{LSI-TR,
  author      = {Mahdi Shafiei and Singer Wang and Roger Zhang and Evangelos Milios and Bin Tang and Jane Tougas and Ray Spiteri},
  title       = {A Systematic Study of Document Representation and Dimension Reduction for Text Clustering},
  institution = {Faculty of Computer Science, Dalhousie University},
  type        = {Technical Report},
  address     = {Halifax, Canada},
  year        = {2006},
  month       = jul,
  note        = {http://www.cs.dal.ca/research/techreports/2006/},
}
% Placeholder carried over from the original file, located after LSI-TR:
%   number = "",

@article{Halkidi2001,
  author  = {M. Halkidi and Y. Batistakis and M. Vazirgiannis},
  title   = {On Clustering Validation Techniques},
  journal = {Journal of Intelligent Information Systems},
  volume  = {17},
  number  = {2/3},
  year    = {2001},
  pages   = {107--145},
}

@inproceedings{Xiong2006,
  author    = {H. Xiong and J. Wu and J. Chen},
  title     = {{K-means} Clustering versus Validation Measures: A Data Distribution Perspective},
  booktitle = {KDD},
  publisher = {ACM},
  address   = {Philadelphia, PA, USA},
  year      = {2006},
  month     = aug # " 20--23",
}