% Bibliography database: references on record linkage, duplicate detection and
% deduplication, genetic programming, and related information-retrieval texts.
%
% Conventions used in this file:
%   - one field per line; personal names in "Last, First" form
%   - doi holds the bare identifier (no resolver URL prefix)
%   - page ranges use a double hyphen; month uses the three-letter macros
%   - accented letters are written as braced LaTeX accent commands
%   - Christen05 inherits from DBLP:conf/ideal/2005 via crossref, so the
%     proceedings entry must stay after it in this file

@article{Elmagarmid07,
  author    = {Elmagarmid, Ahmed K. and Ipeirotis, Panagiotis G. and Verykios, Vassilios S.},
  title     = {Duplicate Record Detection: A Survey},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  volume    = {19},
  number    = {1},
  year      = {2007},
  issn      = {1041-4347},
  pages     = {1--16},
  doi       = {10.1109/TKDE.2007.9},
  publisher = {IEEE Educational Activities Department},
  address   = {Piscataway, NJ, USA},
}

@inproceedings{Carvalho08b,
  author    = {de Carvalho, Mois{\'e}s G. and Laender, Alberto H. F. and Gon{\c{c}}alves, Marcos Andr{\'e} and Porto, Thiago C.},
  title     = {The Impact of Parameter Setup on a Genetic Programming Approach to Record Deduplication},
  booktitle = {Proceedings of the 23rd Brazilian Symposium on Databases},
  year      = {2008},
  pages     = {91--105},
  address   = {Campinas, SP, Brazil},
}

@inproceedings{Carvalho06,
  author    = {de Carvalho, Mois{\'e}s G. and Gon{\c{c}}alves, Marcos Andr{\'e} and Laender, Alberto H. F. and da Silva, Altigran S.},
  title     = {Learning to Deduplicate},
  booktitle = {Proceedings of the Sixth ACM/IEEE-CS Joint Conference on Digital Libraries},
  year      = {2006},
  isbn      = {1-59593-354-9},
  pages     = {41--50},
  address   = {Chapel Hill, NC, USA},
  doi       = {10.1145/1141753.1141760},
}

@article{BilenkoEtAl03,
  author    = {Bilenko, Mikhail and Mooney, Raymond and Cohen, William and Ravikumar, Pradeep and Fienberg, Stephen},
  title     = {Adaptive Name Matching in Information Integration},
  journal   = {IEEE Intelligent Systems},
  volume    = {18},
  number    = {5},
  year      = {2003},
  issn      = {1541-1672},
  pages     = {16--23},
  doi       = {10.1109/MIS.2003.1234765},
  publisher = {IEEE Educational Activities Department},
  address   = {Piscataway, NJ, USA},
}

@inproceedings{BilenkoMooney03,
  author    = {Bilenko, Mikhail and Mooney, Raymond J.},
  title     = {Adaptive Duplicate Detection Using Learnable String Similarity Measures},
  booktitle = {Proceedings of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year      = {2003},
  isbn      = {1-58113-737-0},
  pages     = {39--48},
  address   = {Washington, DC, USA},
  doi       = {10.1145/956750.956759},
}

@book{Banzhaf98,
  author    = {Banzhaf, Wolfgang and Francone, Frank D. and Keller, Robert E. and Nordin, Peter},
  title     = {Genetic Programming: An Introduction on the Automatic Evolution of Computer Programs and Its Applications},
  year      = {1998},
  isbn      = {1-55860-510-X},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address   = {San Francisco, CA, USA},
}

@techreport{BellDravis06,
  author      = {Bell, Royce and Dravis, Frank},
  title       = {Is Your Data Dirty?: (And Does That Matter?)},
  year        = {2006},
  type        = {White Paper},
  institution = {Accenture},
  note        = {Available at http://www.accenture.com},
}

@inproceedings{Bhattacharya04,
  author    = {Bhattacharya, Indrajit and Getoor, Lise},
  title     = {Iterative Record Linkage for Cleaning and Integration},
  booktitle = {Proceedings of the Ninth ACM SIGMOD Workshop on Research Issues in Data Mining and Knowledge Discovery},
  year      = {2004},
  pages     = {11--18},
  address   = {Paris, France},
}

@inproceedings{Chaudhuri03,
  author    = {Chaudhuri, Surajit and Ganjam, Kris and Ganti, Venkatesh and Motwani, Rajeev},
  title     = {Robust and Efficient Fuzzy Match for Online Data Cleaning},
  booktitle = {Proceedings of the 2003 ACM SIGMOD International Conference on Management of Data},
  year      = {2003},
  isbn      = {1-58113-634-X},
  pages     = {313--324},
  address   = {San Diego, CA, USA},
  doi       = {10.1145/872757.872796},
}

@inproceedings{Carvalho08a,
  author    = {de Carvalho, Mois{\'e}s G. and Laender, Alberto H. F. and Gon{\c{c}}alves, Marcos Andr{\'e} and da Silva, Altigran S.},
  title     = {Replica Identification Using Genetic Programming},
  booktitle = {Proceedings of the 2008 ACM Symposium on Applied Computing},
  year      = {2008},
  isbn      = {978-1-59593-753-7},
  pages     = {1801--1806},
  address   = {Fortaleza, CE, Brazil},
  doi       = {10.1145/1363686.1364118},
}

@article{Fellegi69,
  author               = {Fellegi, Ivan P. and Sunter, Alan B.},
  title                = {A Theory for Record Linkage},
  journal              = {Journal of the American Statistical Association},
  volume               = {64},
  number               = {328},
  year                 = {1969},
  pages                = {1183--1210},
  doi                  = {10.2307/2286061},
  url                  = {http://dx.doi.org/10.2307/2286061},
  keywords             = {record\_linkage},
  citeulike-article-id = {590229},
  citeulike-linkout-0  = {http://dx.doi.org/10.2307/2286061},
  citeulike-linkout-1  = {http://www.jstor.org/stable/2286061},
  posted-at            = {2006-04-18 08:45:58},
  priority             = {2},
}

@inproceedings{Koudas06,
  author    = {Koudas, Nick and Sarawagi, Sunita and Srivastava, Divesh},
  title     = {Record Linkage: Similarity Measures and Algorithms},
  booktitle = {Proceedings of the 2006 ACM SIGMOD International Conference on Management of Data},
  year      = {2006},
  isbn      = {1-59593-434-0},
  pages     = {802--803},
  address   = {Chicago, IL, USA},
  doi       = {10.1145/1142473.1142599},
}

@book{Koza92,
  author               = {Koza, J. R.},
  title                = {Genetic Programming: On the Programming of Computers By Means of Natural Selection},
  year                 = {1992},
  publisher            = {The MIT Press},
  address              = {Cambridge, MA},
  keywords             = {bibtex-import},
  citeulike-article-id = {1505719},
  posted-at            = {2007-07-26 20:53:07},
  priority             = {0},
}

@article{Verykios03,
  author    = {Verykios, V. S. and Moustakides, G. V. and Elfeky, M. G.},
  title     = {A {Bayesian} Decision Model for Cost Optimal Record Matching},
  journal   = {The VLDB Journal},
  volume    = {12},
  number    = {1},
  year      = {2003},
  issn      = {1066-8888},
  pages     = {28--40},
  doi       = {10.1007/s00778-002-0072-y},
  publisher = {Springer-Verlag New York, Inc.},
  address   = {Secaucus, NJ, USA},
}

@techreport{Wheatley04,
  author      = {Wheatley, M.},
  title       = {Operation Clean Data},
  year        = {2004},
  month       = aug,
  institution = {CIO Asia Magazine},
  note        = {Available at http://www.cio-asia.com},
  pubcat      = {techreport},
}

@book{Jain91,
  author               = {Jain, R. K.},
  title                = {The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling},
  year                 = {1991},
  month                = apr,
  day                  = {30},
  isbn                 = {0471503363},
  publisher            = {Wiley},
  address              = {New York, NY, USA},
  howpublished         = {Hardcover},
  url                  = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20&path=ASIN/0471503363},
  keywords             = {allocation, resource},
  citeulike-article-id = {5190414},
  posted-at            = {2009-07-17 09:36:14},
  priority             = {2},
}

@article{GuBaxter06,
  author               = {Gu, Lifang and Baxter, Rohan},
  title                = {Decision Models for Record Linkage},
  journal              = {Selected Papers from Australasian Data Mining Conference},
  volume               = {3755},
  year                 = {2006},
  pages                = {146--160},
  doi                  = {10.1007/11677437_12},
  url                  = {http://dx.doi.org/10.1007/11677437_12},
  citeulike-article-id = {2200851},
  posted-at            = {2008-01-06 20:31:06},
  priority             = {2},
}

@inproceedings{Christen05,
  author    = {Christen, Peter},
  title     = {Probabilistic Data Generation for Deduplication and Data Linkage},
  booktitle = {Proceedings of Intelligent Data Engineering and Automated Learning},
  year      = {2005},
  pages     = {109--116},
  doi       = {10.1007/11508069_15},
  crossref  = {DBLP:conf/ideal/2005},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@proceedings{DBLP:conf/ideal/2005,
  editor    = {Gallagher, Marcus and Hogan, James M. and Maire, Fr{\'e}d{\'e}ric},
  title     = {Intelligent Data Engineering and Automated Learning -- {IDEAL} 2005, 6th International Conference, Brisbane, Australia, July 6--8, 2005, Proceedings},
  booktitle = {IDEAL},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  volume    = {3578},
  year      = {2005},
  isbn      = {3-540-26972-X},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@book{BaezaBerthier99,
  author    = {Baeza-Yates, Ricardo A. and Ribeiro-Neto, Berthier},
  title     = {Modern Information Retrieval},
  year      = {1999},
  isbn      = {020139829X},
  publisher = {Addison-Wesley Longman Publishing Co., Inc.},
  address   = {Boston, MA, USA},
}

@article{Newcombe59,
  author   = {Newcombe, Howard B. and Kennedy, James M. and Axford, S. J. and James, A. P.},
  title    = {Automatic Linkage of Vital Records},
  journal  = {Science},
  volume   = {130},
  number   = {3381},
  year     = {1959},
  month    = oct,
  pages    = {954--959},
  biburl   = {http://www.bibsonomy.org/bibtex/2613c244f0c03d065f214a0fb6a886f48/pirot},
  keywords = {imported},
}

@inproceedings{Tejada02,
  author    = {Tejada, Sheila and Knoblock, Craig A. and Minton, Steven},
  title     = {Learning Domain-Independent String Transformation Weights for High Accuracy Object Identification},
  booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year      = {2002},
  isbn      = {1-58113-567-X},
  pages     = {350--359},
  address   = {Edmonton, AB, Canada},
  doi       = {10.1145/775047.775099},
}

@article{Tejada01,
  author    = {Tejada, Sheila and Knoblock, Craig A. and Minton, Steven},
  title     = {Learning Object Identification Rules for Information Integration},
  journal   = {Information Systems},
  volume    = {26},
  number    = {8},
  year      = {2001},
  issn      = {0306-4379},
  pages     = {607--633},
  doi       = {10.1016/S0306-4379(01)00042-4},
  publisher = {Elsevier Science Ltd.},
  address   = {Oxford, UK},
}

@article{Cohen00,
  author    = {Cohen, William W.},
  title     = {Data Integration Using Similarity Joins and a Word-Based Information Representation Language},
  journal   = {ACM Transactions on Information Systems},
  volume    = {18},
  number    = {3},
  year      = {2000},
  issn      = {1046-8188},
  pages     = {288--321},
  doi       = {10.1145/352595.352598},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

@inproceedings{Christen08,
  author    = {Christen, Peter},
  title     = {{Febrl}: A Freely Available Record Linkage System with a Graphical User Interface},
  booktitle = {Proceedings of the Second Australasian Workshop on Health Data and Knowledge Management},
  year      = {2008},
  isbn      = {978-1-920682-61-3},
  pages     = {17--25},
  address   = {Wollongong, NSW, Australia},
}

@inproceedings{Goncalves09,
  author    = {Gon{\c{c}}alves, Gabriel Silva and de Carvalho, Mois{\'e}s G. and Laender, Alberto H. F. and Gon{\c{c}}alves, Marcos Andr{\'e}},
  title     = {Sele{\c{c}}{\~a}o Autom{\'a}tica de Exemplos de Treino para um M{\'e}todo de Deduplica{\c{c}}{\~a}o de Registros baseado em Programa{\c{c}}{\~a}o Gen{\'e}tica},
  booktitle = {XXIV Simp{\'o}sio Brasileiro de Banco de Dados},
  year      = {2009},
  pages     = {76--90},
  address   = {Fortaleza, CE, Brasil},
  url       = {http://www.lbd.dcc.ufmg.br:8080/colecoes/sbbd/2009/006.pdf},
}

@book{Joachims02,
  author    = {Joachims, Thorsten},
  title     = {Learning to Classify Text Using Support Vector Machines: Methods, Theory and Algorithms},
  year      = {2002},
  isbn      = {079237679X},
  publisher = {Kluwer Academic Publishers},
  address   = {Norwell, MA, USA},
}

@article{Geer08,
  author    = {Geer, David},
  title     = {Reducing the Storage Burden via Data Deduplication},
  journal   = {IEEE Computer},
  volume    = {41},
  number    = {12},
  year      = {2008},
  issn      = {0018-9162},
  pages     = {15--17},
  doi       = {10.1109/MC.2008.502},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, CA, USA},
}

@techreport{Winkler99,
  author      = {Winkler, William E.},
  title       = {The State of Record Linkage and Current Research Problems},
  institution = {Statistical Research Division, U.S. Census Bureau},
  year        = {1999},
}

@book{Salton89,
  author    = {Salton, Gerard},
  title     = {Automatic Text Processing: The Transformation, Analysis, and Retrieval of Information by Computer},
  year      = {1989},
  isbn      = {0-201-12227-8},
  publisher = {Addison-Wesley Longman Publishing Co., Inc.},
  address   = {Boston, MA, USA},
}

@inproceedings{CarvalhoS03,
  author    = {Carvalho, Joyce C. P. and da Silva, Altigran S.},
  title     = {Finding Similar Identities among Objects from Multiple Web Sources},
  booktitle = {Proceedings of the Fifth ACM International Workshop on Web Information and Data Management},
  year      = {2003},
  isbn      = {1-58113-725-7},
  pages     = {90--93},
  location  = {New Orleans, Louisiana, USA},
  doi       = {10.1145/956699.956719},
  publisher = {ACM},
  address   = {New York, NY, USA},
}