@article {allen_et_al:DM:2017:7146, title = {Engineering Academic Software (Dagstuhl Perspectives Workshop 16252)}, journal = {Dagstuhl Manifestos}, volume = {6}, number = {1}, year = {2017}, month = {01/2017}, pages = {1{\textendash}20}, publisher = {Schloss Dagstuhl{\textendash}Leibniz-Zentrum fuer Informatik}, address = {Dagstuhl, Germany}, abstract = {Software is often a critical component of scientific research. It can be a component of the academic research methods used to produce research results, or it may itself be an academic research result. Software, however, has rarely been considered to be a citable artifact in its own right. With the advent of open-source software, artifact evaluation committees of conferences, and journals that include source code and running systems as part of the published artifacts, we foresee that software will increasingly be recognized as part of the academic process. The quality and sustainability of this software must be accounted for, both a priori and a posteriori. The Dagstuhl Perspectives Workshop on "Engineering Academic Software" has examined the strengths, weaknesses, risks, and opportunities of academic software engineering. A key outcome of the workshop is this Dagstuhl Manifesto, serving as a roadmap towards future professional software engineering for software-based research instruments and other software produced and used in an academic context. The manifesto is expressed in terms of a series of actionable "pledges" that users and developers of academic research software can take as concrete steps towards improving the environment in which that software is produced.}, issn = {2193-2433}, doi = {10.4230/DagMan.6.1.1}, url = {http://drops.dagstuhl.de/opus/volltexte/2017/7146}, author = {Alice Allen and Cecilia Aragon and Christoph Becker and Jeffrey Carver and Andrei Chis and Benoit Combemale and Mike Croucher and Kevin Crowston and Daniel Garijo and Ashish Gehani and Carole Goble and Robert Haines and Robert Hirschfeld and James Howison and Kathryn Huff and Caroline Jay and Daniel S. Katz and Claude Kirchner and Katie Kuksenok and Ralf L{\"a}mmel and Oscar Nierstrasz and Matt Turk and Rob van Nieuwpoort and Matthew Vaughn and Jurgen J. Vinju}, editor = {Alice Allen and others} } @article {370, title = {The Role and Relevance of Experimentation in Informatics}, year = {2012}, month = {11/2012}, institution = {Informatics Europe}, abstract = {Informatics is a relatively young field within science and engineering. Its research and development methodologies build on the scientific and design methodologies in the classical areas, often with new elements to it. We take an in-depth look at one of the less well-understood methodologies in informatics, namely experimentation. What does it mean to do experiments in informatics? Does it make sense to {\textquoteleft}import{\textquoteright} traditional principles of experimentation from classical disciplines into the field of computing and information processing? How should experiments be documented? These are some of the questions that are treated. The report argues for the key role of empirical research and experimentation in contemporary Informatics. Many IT systems, large and small, can only be designed sensibly with the help of experiments. We recommend that professionals and students alike are well-educated in the principles of sound experimentation in Informatics. 
We also recommend that experimentation protocols are used and standardized as part of the experimental method in Informatics.}, url = {http://www.informatics-europe.org/images/documents/informatics-experimentation_2013.pdf}, author = {Carlos Andujar and Viola Schiaffonati and Fabio A. Schreiber and Letizia Tanca and Matti Tedre and Kees van Hee and Jan van Leeuwen} } @article {363, title = {Relevance relations for the concept of reproducibility}, journal = {J. R. Soc. Interface}, volume = {11}, year = {2014}, month = {05/2014}, abstract = {The concept of reproducibility is widely considered a cornerstone of scientific methodology. However, recent problems with the reproducibility of empirical results in large-scale systems and in biomedical research have cast doubts on its universal and rigid applicability beyond the so-called basic sciences. Reproducibility is a particularly difficult issue in interdisciplinary work where the results to be reproduced typically refer to different levels of description of the system considered. In such cases, it is mandatory to distinguish between more and less relevant features, attributes or observables of the system, depending on the level at which they are described. For this reason, we propose a scheme for a general {\textquoteleft}relation of relevance{\textquoteright} between the level of complexity at which a system is considered and the granularity of its description. This relation implies relevance criteria for particular selected aspects of a system and its description, which can be operationally implemented by an interlevel relation called {\textquoteleft}contextual emergence{\textquoteright}. It yields a formally sound and empirically applicable procedure to translate between descriptive levels and thus construct level-specific criteria for reproducibility in an overall consistent fashion. Relevance relations merged with contextual emergence challenge the old idea of one fundamental ontology from which everything else derives. At the same time, our proposal is specific enough to resist the backlash into a relativist patchwork of unconnected model fragments.}, keywords = {complexity, relevance, Reproducibility}, doi = {10.1098/​rsif.2013.1030}, url = {http://rsif.royalsocietypublishing.org/content/11/94/20131030}, author = {H. Atmanspacher and L. Bezzola Lambert and G. Folkers and P. A. Schubiger} } @conference {Bachmann:2010:MLB:1882291.1882308, title = {The missing links: bugs and bug-fix commits}, booktitle = {Proceedings of the eighteenth ACM SIGSOFT international symposium on Foundations of software engineering}, series = {FSE {\textquoteright}10}, year = {2010}, pages = {97{\textendash}106}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Empirical studies of software defects rely on links between bug databases and program code repositories. This linkage is typically based on bug-fixes identified in developer-entered commit logs. Unfortunately, developers do not always report which commits perform bug-fixes. Prior work suggests that such links can be a biased sample of the entire population of fixed bugs. The validity of statistical hypotheses-testing based on linked data could well be affected by bias. Given the wide use of linked defect data, it is vital to gauge the nature and extent of the bias, and try to develop testable theories and models of the bias. 
To do this, we must establish ground truth: manually analyze a complete version history corpus, and nail down those commits that fix defects, and those that do not. This is a difficult task, requiring an expert to compare versions, analyze changes, find related bugs in the bug database, reverse-engineer missing links, and finally record their work for use later. This effort must be repeated for hundreds of commits to obtain a useful sample of reported and unreported bug-fix commits. We make several contributions. First, we present Linkster, a tool to facilitate link reverse-engineering. Second, we evaluate this tool, engaging a core developer of the Apache HTTP web server project to exhaustively annotate 493 commits that occurred during a six-week period. Finally, we analyze this comprehensive data set, showing that there are serious and consequential problems in the data.}, keywords = {apache, bias, case study, manual annotation, tool}, isbn = {978-1-60558-791-2}, doi = {10.1145/1882291.1882308}, url = {http://doi.acm.org/10.1145/1882291.1882308}, author = {Bachmann, Adrian and Bird, Christian and Rahman, Foyzur and Devanbu, Premkumar and Bernstein, Abraham} } @conference {Bailey:2009:MPC:1629911.1630049, title = {Misleading performance claims in parallel computations}, booktitle = {Proceedings of the 46th Annual Design Automation Conference}, series = {DAC {\textquoteright}09}, year = {2009}, pages = {528{\textendash}533}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In a previous humorous note entitled "Twelve Ways to Fool the Masses ...," I outlined twelve common ways in which performance figures for technical computer systems can be distorted. In this paper and accompanying conference talk, I give a reprise of these twelve "methods" and give some actual examples that have appeared in peer-reviewed literature in years past. I then propose guidelines for reporting performance, the adoption of which would raise the level of professionalism and reduce the level of confusion, not only in the world of device simulation but also in the larger arena of technical computing.}, keywords = {parallel computing}, isbn = {978-1-60558-497-3}, doi = {10.1145/1629911.1630049}, url = {http://doi.acm.org/10.1145/1629911.1630049}, author = {Bailey, David H.} } @article {Bailey:1992:MPR:1402573.1402578, title = {Misleading Performance Reporting in the Supercomputing Field}, journal = {Sci. Program.}, volume = {1}, number = {2}, year = {1992}, month = {04/1992}, pages = {141{\textendash}151}, publisher = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {In a previous humorous note, I outlined 12 ways in which performance figures for scientific supercomputers can be distorted. In this paper, the problem of potentially misleading performance reporting is discussed in detail. Included are some examples that have appeared in recent published scientific papers. 
This paper also includes some proposed guidelines for reporting performance, the adoption of which would raise the level of professionalism and reduce the level of confusion in the field of supercomputing.}, issn = {1058-9244}, url = {http://dl.acm.org/citation.cfm?id=1402573.1402578}, author = {Bailey, David H.} } @article {300, title = {Twelve ways to fool the masses when giving performance results on parallel computers}, journal = {Supercomputing Review}, year = {1991}, month = {08/1991}, pages = {54--55}, chapter = {54}, abstract = {Many of us in the field of highly parallel scientific computing recognize that it is often quite difficult to match the run time performance of the best conventional supercomputers. This humorous article outlines twelve ways commonly used in scientific papers and presentations to artificially boost performance rates and to present these results in the {\textquotedblleft}best possible light{\textquotedblright} compared to other systems.}, url = {http://crd-legacy.lbl.gov/~dhbailey/dhbpapers/twelve-ways.pdf}, author = {Bailey, David H.} } @article {304, title = {Independent labs to verify high-profile papers}, journal = {Nature | News}, year = {2012}, month = {14 August 2012}, abstract = {Reproducibility Initiative aims to speed up preclinical research.}, doi = {doi:10.1038/nature.2012.11176}, url = {http://www.nature.com/news/independent-labs-to-verify-high-profile-papers-1.11176}, author = {Monya Baker} } @conference {Basili:1996:RES:227726.227818, title = {The role of experimentation in software engineering: past, current, and future}, booktitle = {Proceedings of the 18th international conference on Software engineering}, series = {ICSE {\textquoteright}96}, year = {1996}, pages = {442{\textendash}449}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {Software engineering needs to follow the model of other physical sciences and develop an experimental paradigm for the field. This paper proposes the approach towards developing an experimental component of such a paradigm. The approach is based upon a quality improvement paradigm that addresses the role of experimentation and process improvement in the context of industrial development. The paper outlines a classification scheme for characterizing such experiments.}, isbn = {0-8186-7246-3}, url = {http://portal.acm.org/citation.cfm?id=227726.227818}, author = {Basili, Victor R.} } @inbook {292, title = {The Goal Question Metric Approach}, booktitle = {Encyclopedia of Software Engineering}, year = {1994}, publisher = {Wiley}, organization = {Wiley}, url = {ftp://ftp.cs.umd.edu/pub/sel/papers/gqm.pdf}, author = {Basili, Victor R. and Gianluigi Caldiera and H. Dieter Rombach} } @article {303, title = {More trial, less error - An effort to improve scientific studies}, journal = {Reuters}, year = {2012}, month = {08/2012}, abstract = {So many scientific studies are making incorrect claims that a new service has sprung up to fact-check reported findings by repeating the experiments.}, url = {http://www.reuters.com/article/2012/08/14/us-science-replication-service-idUSBRE87D0I820120814}, author = {Sharon Begley} } @article {356, title = {An introduction to the study of experimental medicine}, journal = {Journal of the American Pharmaceutical Association}, volume = {39}, year = {1950}, month = {10/1950}, chapter = {597}, abstract = {Contents: Part 1: Experimental Reasoning Chapter 1: Observation and Experiment i. 
Various definitions of observation and experiment ii. Gaining experience and relying on observation is different from making experiments and making observations iii. The investigator; scientific research iv. Observers and experimenters; the sciences of observation and of experiment v. Experiment is fundamentally only induced observation vi. In experimental reasoning, experimenters are not separate from observers Chapter 2: The A Priori Idea and Doubt in Experimental Reasoning i. Experimental truths are objective or external ii. Intuition or feeling begets the experimental idea iii. Experimenters must doubt, avoid fixed ideas, and always keep their freedom of mind iv. The independent character of the experimental method v. Induction and deduction in experimental reasoning vi. Doubt in experimental reasoning vii. The principle of the experimental criterion viii. Proof and counterproof Part 2: Experimentation with Living Beings Part 3: Applications of the Experimental Method to the Study of Vital Phenomena}, doi = {10.1002/jps.3030391022}, url = {http://www.gutenberg.org/ebooks/16234}, author = {Claude Bernard} } @book {345, title = {The Art of Scientific Investigation}, year = {1957}, publisher = {Norton}, organization = {Norton}, address = {New York, NY, USA}, url = {http://archive.org/details/artofscientifici00beve}, author = {Beveridge, William Ian Beardmore} } @conference {Bird:2009:FBB:1595696.1595716, title = {Fair and balanced?: bias in bug-fix datasets}, booktitle = {Proceedings of the 7th joint meeting of the European software engineering conference and the ACM SIGSOFT symposium on The foundations of software engineering}, series = {ESEC/FSE {\textquoteright}09}, year = {2009}, pages = {121{\textendash}130}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Software engineering researchers have long been interested in where and why bugs occur in code, and in predicting where they might turn up next. Historical bug-occurrence data has been key to this research. Bug tracking systems, and code version histories, record when, how and by whom bugs were fixed; from these sources, datasets that relate file changes to bug fixes can be extracted. These historical datasets can be used to test hypotheses concerning processes of bug introduction, and also to build statistical bug prediction models. Unfortunately, processes and humans are imperfect, and only a fraction of bug fixes are actually labelled in source code version histories, and thus become available for study in the extracted datasets. The question naturally arises, are the bug fixes recorded in these historical datasets a fair representation of the full population of bug fixes? In this paper, we investigate historical data from several software projects, and find strong evidence of systematic bias. We then investigate the potential effects of "unfair, imbalanced" datasets on the performance of prediction techniques. 
We draw the lesson that bias is a critical problem that threatens both the effectiveness of processes that rely on biased datasets to build prediction models and the generalizability of hypotheses tested on biased data.}, keywords = {bias}, isbn = {978-1-60558-001-2}, doi = {10.1145/1595696.1595716}, url = {http://doi.acm.org/10.1145/1595696.1595716}, author = {Bird, Christian and Bachmann, Adrian and Aune, Eirik and Duffy, John and Bernstein, Abraham and Filkov, Vladimir and Devanbu, Premkumar} } @article {Blackburn:2016:TWT:2982214.2983574, title = {The Truth, The Whole Truth, and Nothing But the Truth: A Pragmatic Guide to Assessing Empirical Evaluations}, journal = {ACM Trans. Program. Lang. Syst.}, volume = {38}, number = {4}, year = {2016}, month = {10/2016}, pages = {15:1{\textendash}15:20}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {An unsound claim can misdirect a field, encouraging the pursuit of unworthy ideas and the abandonment of promising ideas. An inadequate description of a claim can make it difficult to reason about the claim, for example, to determine whether the claim is sound. Many practitioners will acknowledge the threat of unsound claims or inadequate descriptions of claims to their field. We believe that this situation is exacerbated, and even encouraged, by the lack of a systematic approach to exploring, exposing, and addressing the source of unsound claims and poor exposition. This article proposes a framework that identifies three sins of reasoning that lead to unsound claims and two sins of exposition that lead to poorly described claims and evaluations. Sins of exposition obfuscate the objective of determining whether or not a claim is sound, while sins of reasoning lead directly to unsound claims. Our framework provides practitioners with a principled way of critiquing the integrity of their own work and the work of others. We hope that this will help individuals conduct better science and encourage a cultural shift in our research community to identify and promulgate sound claims.}, keywords = {Experimental evaluation, Experimentation, observation study}, issn = {0164-0925}, doi = {10.1145/2983574}, url = {http://doi.acm.org/10.1145/2983574}, author = {Blackburn, Stephen M. and Diwan, Amer and Hauswirth, Matthias and Sweeney, Peter F. and Amaral, Jos{\'e} Nelson and Brecht, Tim and Bulej, Lubom\'{\i}r and Click, Cliff and Eeckhout, Lieven and Fischmeister, Sebastian and Frampton, Daniel and Hendren, Laurie J. and Hind, Michael and Hosking, Antony L. and Jones, Richard E. and Kalibera, Tomas and Keynes, Nathan and Nystrom, Nathaniel and Andreas Zeller} } @article {Blackburn:2008:WUS:1378704.1378723, title = {Wake Up and Smell the Coffee: Evaluation Methodology for the 21st Century}, journal = {Commun. ACM}, volume = {51}, number = {8}, year = {2008}, pages = {83{\textendash}89}, publisher = {ACM}, address = {New York, NY, USA}, issn = {0001-0782}, doi = {10.1145/1378704.1378723}, url = {http://doi.acm.org/10.1145/1378704.1378723}, author = {Blackburn, Stephen M. and McKinley, Kathryn S. and Garner, Robin and Hoffmann, Chris and Khan, Asjad M. and Bentzur, Rotem and Diwan, Amer and Feinberg, Daniel and Frampton, Daniel and Guyer, Samuel Z. and Hirzel, Martin and Hosking, Antony and Jump, Maria and Lee, Han and Moss, J. Eliot B. 
and Phansalkar, Aashish and Stefanovik, Darko and VanDrunen, Thomas and von Dincklage, Daniel and Wiedermann, Ben} } @conference {1167488, title = {The DaCapo benchmarks: java benchmarking development and analysis}, booktitle = {OOPSLA {\textquoteright}06: Proceedings of the 21st annual ACM SIGPLAN conference on Object-oriented programming systems, languages, and applications}, year = {2006}, pages = {169{\textendash}190}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = { Since benchmarks drive computer science research and industry product development, which ones we use and how we evaluate them are key questions for the community. Despite complex runtime tradeoffs due to dynamic compilation and garbage collection required for Java programs, many evaluations still use methodologies developed for C, C++, and Fortran. SPEC, the dominant purveyor of benchmarks, compounded this problem by institutionalizing these methodologies for their Java benchmark suite. This paper recommends benchmarking selection and evaluation methodologies, and introduces the DaCapo benchmarks, a set of open source, client-side Java benchmarks. We demonstrate that the complex interactions of (1) architecture, (2) compiler, (3) virtual machine, (4) memory management, and (5) application require more extensive evaluation than C, C++, and Fortran which stress (4) much less, and do not require (3). We use and introduce new value, time-series, and statistical metrics for static and dynamic properties such as code complexity, code size, heap composition, and pointer mutations. No benchmark suite is definitive, but these metrics show that DaCapo improves over SPEC Java in a variety of ways, including more complex code, richer object behaviors, and more demanding memory system requirements. This paper takes a step towards improving methodologies for choosing and evaluating benchmarks to foster innovation in system design and implementation for Java and other managed languages. }, keywords = {benchmark, DaCapo, Java, methodology, SPEC}, isbn = {1-59593-348-4}, doi = {http://doi.acm.org/10.1145/1167473.1167488}, author = {Blackburn, Stephen M. and Garner, Robin and Hoffmann, Chris and Khang, Asjad M. and McKinley, Kathryn S. and Bentzur, Rotem and Diwan, Amer and Feinberg, Daniel and Frampton, Daniel and Guyer, Samuel Z. and Hirzel, Martin and Hosking, Antony and Jump, Maria and Lee, Han and Moss, J. Eliot B. and Moss, B. and Phansalkar, Aashish and Stefanovi{\'c}, Darko and VanDrunen, Thomas and von Dincklage, Daniel and Wiedermann, Ben} } @article {Bonnet:2011:RWE:2034863.2034873, title = {Repeatability and workability evaluation of SIGMOD 2011}, journal = {SIGMOD Rec.}, volume = {40}, year = {2011}, pages = {45{\textendash}48}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {SIGMOD has offered, since 2008, to verify the experiments published in the papers accepted at the conference. This year, we have been in charge of reproducing the experiments provided by the authors (repeatability), and exploring changes to experiment parameters (workability). In this paper, we assess the SIGMOD repeatability process in terms of participation, review process and results. While the participation is stable in terms of number of submissions, we find this year a sharp contrast between the high participation from Asian authors and the low participation from American authors. 
We also find that most experiments are distributed as Linux packages accompanied by instructions on how to setup and run the experiments. We are still far from the vision of executable papers.}, issn = {0163-5808}, doi = {http://doi.acm.org/10.1145/2034863.2034873}, url = {http://doi.acm.org/10.1145/2034863.2034873}, author = {Bonnet, Philippe and Manegold, Stefan and Bj{\o}rling, Matias and Cao, Wei and Gonzalez, Javier and Granados, Joel and Hall, Nancy and Idreos, Stratos and Ivanova, Milena and Johnson, Ryan and Koop, David and Kraska, Tim and M{\"u}ller, Ren{\'e} and Olteanu, Dan and Papotti, Paolo and Reilly, Christine and Tsirogiannis, Dimitris and Yu, Cong and Freire, Juliana and Shasha, Dennis} } @conference {192, title = {Benefits and Barriers of User Evaluation in Software Engineering Research}, booktitle = {OOPSLA {\textquoteright}11: Proceedings of the ACM international conference on Object oriented programming systems languages and applications}, year = {2011}, month = {10/2011}, publisher = {ACM}, organization = {ACM}, address = {Portland, Oregon, USA}, abstract = {In this paper, we identify trends about, benefits from, and barriers to performing user evaluations in software engineering research. From a corpus of over 3,000 papers spanning ten years, we report on various subtypes of user evaluations (e.g., coding tasks vs. questionnaires) and relate user evaluations to paper topics (e.g., debugging vs. technology transfer). We identify the external measures of impact, such as best paper awards and citation counts, that are correlated with the presence of user evaluations. We complement this with a survey of over 100 researchers from over 40 different universities and labs in which we identify a set of perceived barriers to performing user evaluations. }, keywords = {Human study, User evaluation}, url = {http://www.cs.virginia.edu/~weimer/p/oopsla052-buse.pdf}, author = {Raymond P.L. Buse and Caitlin Sadowski and Westley Weimer} } @article {childers_et_al:DR:2016:5762, title = {Artifact Evaluation for Publications (Dagstuhl Perspectives Workshop 15452)}, journal = {Dagstuhl Reports}, volume = {5}, number = {11}, year = {2016}, pages = {29{\textendash}35}, publisher = {Schloss Dagstuhl{\textendash}Leibniz-Zentrum fuer Informatik}, address = {Dagstuhl, Germany}, abstract = {This report documents the program and the outcomes of Dagstuhl Perspectives Workshop 15452 "Artifact Evaluation for Publications". This Perspectives Workshop conveyed several stakeholders in artifact evaluation from different communities to assess how artifact evaluation is working and make recommendations to the computer systems research community about several issues with the process.}, issn = {2192-5283}, doi = {http://dx.doi.org/10.4230/DagRep.5.11.29}, url = {http://drops.dagstuhl.de/opus/volltexte/2016/5762}, author = {Bruce R. Childers and Grigori Fursin and Shriram Krishnamurthi and Andreas Zeller}, editor = {Bruce R. 
Childers and Grigori Fursin and Shriram Krishnamurthi and Andreas Zeller} } @conference {Clark:2004:XAR:1247415.1247462, title = {Xen and the art of repeated research}, booktitle = {Proceedings of the annual conference on USENIX Annual Technical Conference}, series = {ATEC {\textquoteright}04}, year = {2004}, pages = {47{\textendash}47}, publisher = {USENIX Association}, organization = {USENIX Association}, address = {Berkeley, CA, USA}, abstract = {Xen is an x86 virtual machine monitor produced by the University of Cambridge Computer Laboratory and released under the GNU General Public License. Performance results comparing XenoLinux (Linux running in a Xen virtual machine) to native Linux as well as to other virtualization tools such as User Mode Linux (UML) were recently published in the paper "Xen and the Art of Virtualization" at the Symposium on Operating Systems Principles (October 2003). In this study, we repeat this performance analysis of Xen. We also extend the analysis in several ways, including comparing XenoLinux on x86 to an IBM zServer. We use this study as an example of repeated research. We argue that this model of research, which is enabled by open source software, is an important step in transferring the results of computer science research into production environments.}, doi = {http://portal.acm.org/citation.cfm?id=1247415.1247462}, url = {http://www.usenix.org/event/usenix04/tech/freenix/clark.html}, author = {Clark, Bryan and Deshane, Todd and Dow, Eli and Evanchik, Stephen and Finlayson, Matthew and Herne, Jason and Matthews, Jeanna Neefe} } @book {79, title = {Academic Careers for Experimental Computer Scientists and Engineers}, year = {1994}, pages = {152}, publisher = {The National Academies Press}, organization = {The National Academies Press}, address = {Washington, D.C.}, isbn = {0-309-04931-8}, url = {http://www.nap.edu/catalog.php?record_id=2236} } @article {Computer Science and Telecommunications Board:1994:ACE:175276.175285, title = {Academic careers for experimental computer scientists and engineers}, journal = {Commun. ACM}, volume = {37}, year = {1994}, month = {April}, pages = {87{\textendash}90}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {A new report from the Computer Science and Telecommunications Board focuses on the career tracks and knotty issues faced by {\textquotedblleft}experimental{\textquotedblright} faculty members. }, issn = {0001-0782}, doi = {http://doi.acm.org/10.1145/175276.175285}, url = {http://doi.acm.org/10.1145/175276.175285} } @conference {Curtsinger:2013:SSS:2451116.2451141, title = {STABILIZER: Statistically Sound Performance Evaluation}, booktitle = {Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems}, series = {ASPLOS {\textquoteright}13}, year = {2013}, month = {03/2013}, pages = {219{\textendash}228}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Researchers and software developers require effective performance evaluation. Researchers must evaluate optimizations or measure overhead. Software developers use automatic performance regression tests to discover when changes improve or degrade performance. The standard methodology is to compare execution times before and after applying changes. Unfortunately, modern architectural features make this approach unsound. 
Statistically sound evaluation requires multiple samples to test whether one can or cannot (with high confidence) reject the null hypothesis that results are the same before and after. However, caches and branch predictors make performance dependent on machine-specific parameters and the exact layout of code, stack frames, and heap objects. A single binary constitutes just one sample from the space of program layouts, regardless of the number of runs. Since compiler optimizations and code changes also alter layout, it is currently impossible to distinguish the impact of an optimization from that of its layout effects. This paper presents Stabilizer, a system that enables the use of the powerful statistical techniques required for sound performance evaluation on modern architectures. Stabilizer forces executions to sample the space of memory configurations by repeatedly re-randomizing layouts of code, stack, and heap objects at runtime. Stabilizer thus makes it possible to control for layout effects. Re-randomization also ensures that layout effects follow a Gaussian distribution, enabling the use of statistical tests like ANOVA. We demonstrate Stabilizer{\textquoteright}s efficiency (<7\% median overhead) and its effectiveness by evaluating the impact of LLVM{\textquoteright}s optimizations on the SPEC CPU2006 benchmark suite. We find that, while -O2 has a significant impact relative to -O1, the performance impact of -O3 over -O2 optimizations is indistinguishable from random noise.}, keywords = {measurement bias, performance evaluation, randomization}, isbn = {978-1-4503-1870-9}, doi = {10.1145/2451116.2451141}, url = {http://doi.acm.org/10.1145/2451116.2451141}, author = {Curtsinger, Charlie and Berger, Emery D.} } @conference {Curtsinger:2015:COF:2815400.2815409, title = {Coz: Finding Code That Counts with Causal Profiling}, booktitle = {Proceedings of the 25th Symposium on Operating Systems Principles}, series = {SOSP {\textquoteright}15}, year = {2015}, month = {10/2015}, pages = {184{\textendash}197}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Improving performance is a central concern for software developers. To locate optimization opportunities, developers rely on software profilers. However, these profilers only report where programs spent their time: optimizing that code may have no impact on performance. Past profilers thus both waste developer time and make it difficult for them to uncover significant optimization opportunities. This paper introduces causal profiling. Unlike past profiling approaches, causal profiling indicates exactly where programmers should focus their optimization efforts, and quantifies their potential impact. Causal profiling works by running performance experiments during program execution. Each experiment calculates the impact of any potential optimization by virtually speeding up code: inserting pauses that slow down all other code running concurrently. The key insight is that this slowdown has the same relative effect as running that line faster, thus "virtually" speeding it up. We present Coz, a causal profiler, which we evaluate on a range of highly-tuned applications: Memcached, SQLite, and the PARSEC benchmark suite. Coz identifies previously unknown optimization opportunities that are both significant and targeted. 
Guided by Coz, we improve the performance of Memcached by 9\%, SQLite by 25\%, and accelerate six PARSEC applications by as much as 68\%; in most cases, these optimizations involve modifying under 10 lines of code.}, isbn = {978-1-4503-3834-9}, doi = {10.1145/2815400.2815409}, url = {http://doi.acm.org/10.1145/2815400.2815409}, author = {Curtsinger, Charlie and Berger, Emery D.} } @book {306, title = {Evaluation Methodology Basics: The Nuts and Bolts of Sound Evaluation}, year = {2005}, pages = {280}, publisher = {Sage}, organization = {Sage}, abstract = {Evaluation theorists for years have advised evaluators to {\textquotedblleft}take into account{\textquotedblright} all relevant values as part of an evaluation. But especially for the relatively new evaluator (even one who is knowledgeable and experienced in research methodology), there is not a lot of guidance about how this is done. Evaluation Methodology Basics: The Nuts and Bolts of Sound Evaluation provides a step-by-step guide for doing a real evaluation. It focuses on the main kinds of {\textquotedblleft}big picture{\textquotedblright} questions that evaluators usually need to answer, and how the nature of such questions is linked to evaluation methodology choices. Jane Davidson explains how to combine a mix of qualitative and quantitative data with {\textquotedblleft}relevant values{\textquotedblright} (such as needs) to draw explicitly evaluative conclusions. Many students and evaluators find it difficult to visualize what evaluation logic and methodology {\textquotedblleft}look like{\textquotedblright} in practice. To address this, Davidson presents readers with useful rubrics and flowcharts that may be used during each stage of the evaluation. Many of the concepts presented in the chapters are illustrated with specific examples from a range of disciplines. Exercises and {\textquotedblleft}pop quiz{\textquotedblright} questions help reinforce the key points covered in each chapter, provide homework assignments for those teaching an evaluation course, and allow learners to develop slices of an evaluation plan as they work their way through the text. Evaluation Methodology Basics is an ideal text for students of evaluation and students in programs that have evaluation course requirements, such as education, health, sociology, psychology, and many others throughout the social sciences. It will also be essential reading for practitioners who find themselves thrown into evaluation roles without the benefit of specialized evaluation training.}, isbn = {9780761929307}, url = {http://www.sagepub.com/books/Book226129}, author = {E. Jane Davidson} } @article {delling_et_al:DR:2016:6146, title = {Rethinking Experimental Methods in Computing (Dagstuhl Seminar 16111)}, journal = {Dagstuhl Reports}, volume = {6}, number = {3}, year = {2016}, month = {01/2016}, pages = {24{\textendash}43}, publisher = {Schloss Dagstuhl{\textendash}Leibniz-Zentrum fuer Informatik}, address = {Dagstuhl, Germany}, abstract = {This report documents the talks and discussions at the Dagstuhl seminar 16111 "Rethinking Experimental Methods in Computing". The seminar brought together researchers from several computer science communities, including algorithm engineering, programming languages, information retrieval, high-performance computing, operations research, performance analysis, embedded systems, distributed systems, and software engineering. 
The main goals of the seminar were building a network of experimentalists and fostering a culture of sound quantitative experiments in computing. During the seminar, groups of participants have worked on distilling useful resources based on the collective experience gained in different communities and on planning actions to promote sound experimental methods and reproducibility efforts.}, keywords = {Algorithms, Benchmarks, Data sets, experiments, repeatability, Reproducibility, Software Artifacts, statistics}, issn = {2192-5283}, doi = {10.4230/DagRep.6.3.24}, url = {http://drops.dagstuhl.de/opus/volltexte/2016/6146}, author = {Daniel Delling and Camil Demetrescu and David S. Johnson and Vitek, Jan}, editor = {Daniel Delling and Camil Demetrescu and David S. Johnson and Vitek, Jan} } @article {Denning:1981:APL:358790.358791, title = {ACM president{\textquoteright}s letter: performance analysis: experimental computer science at its best}, journal = {Commun. ACM}, volume = {24}, year = {1981}, month = {November}, pages = {725{\textendash}727}, publisher = {ACM}, address = {New York, NY, USA}, issn = {0001-0782}, doi = {http://doi.acm.org/10.1145/358790.358791}, url = {http://doi.acm.org/10.1145/358790.358791}, author = {Denning, Peter J.} } @article {Denning:2005:CSS:1053291.1053309, title = {Is Computer Science Science?}, journal = {Commun. ACM}, volume = {48}, number = {4}, year = {2005}, pages = {27{\textendash}31}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Computer science meets every criterion for being a science, but it has a self-inflicted credibility problem.}, issn = {0001-0782}, doi = {10.1145/1053291.1053309}, url = {http://doi.acm.org/10.1145/1053291.1053309}, author = {Denning, Peter J.} } @article {309, title = {Paper Bricks: An Alternative to Complete-Story Peer Reviewing}, journal = {SIGMOD Record}, volume = {39}, year = {2010}, month = {12/2010}, type = {Open Forum}, abstract = {The peer review system as used in several computer science communities has several flaws including long review times, overloaded reviewers, as well as fostering of niche topics. These flaws decrease quality, lower impact, slow down the innovation process, and lead to frustration of authors, readers, and reviewers. In order to fix this, we propose a new peer review system termed "paper bricks". Paper bricks has several advantages over the existing system including shorter publications, better competition for new ideas, as well as an accelerated innovation process. Furthermore, paper bricks may be implemented with minimal change to the existing peer review systems.}, url = {http://www.sigmod.org/publications/sigmod-record/1012/pdfs/06.forum.dittrich.pdf}, author = {Dittrich, Jens} } @conference {60, title = {Replicability is not Reproducibility: Nor is it Good Science}, booktitle = {The 4th workshop on Evaluation Methods for Machine Learning}, year = {2009}, month = {06/2009}, address = {Montreal, Canada}, abstract = {At various machine learning conferences, at various times, there have been discussions arising from the inability to replicate the experimental result published in a paper. There seems to be a widespread view that we need to do something to address this problem, as it is essential to the advancement of our field. The most compelling argument would seem to be that reproducibility of experimental results is the hallmark of science. Therefore, given that most of us regard machine learning as a scientific discipline, being able to replicate experiments is paramount. 
I want to challenge this view by separating the notion of reproducibility, a generally desirable property, from replicability, its poor cousin. I claim there are important differences between the two. Reproducibility requires changes; replicability avoids them. Although reproducibility is desirable, I contend that the impoverished version, replicability, is one not worth having.}, keywords = {Reproducibility}, url = {http://www.site.uottawa.ca/ICML09WS/papers/w2.pdf}, author = {Drummond, Chris} } @booklet {Druschel06fosteringsystems, title = {Fostering Systems Research in Europe: A White Paper by EuroSys, the European Professional Society in Systems}, year = {2006}, abstract = {The Computer Systems discipline (which encompasses the sub-areas of operating systems, distributed, embedded, real time and pervasive systems) constitutes a central pillar of computer science. Systems research is the scientific study, analysis, modeling and engineering of effective software platforms. Its challenge is to provide dependable, powerful, performant, secure and scalable solutions within an increasingly complex IT environment. As toolsmiths fueled the Industrial Revolution, today Systems researchers lay the foundation for IT services and applications in the Knowledge Era. Healthy research in Systems is therefore essential for the success and continuing innovation of the IT-based industry (be it proprietary or open source) in Europe. Europe contributed many early innovations in Systems and continues to produce significant successes; yet it tends to be overshadowed by research in the US. We find several systemic reasons for this, which need to be addressed. Among others: (1) Overall, Europe under-invests in fundamental research in Systems; (2) the structure and culture of academic institutions do not consistently foster excellence at all levels of Systems education and research; (3) In general, European research groups are isolated, and need to network more effectively amongst each other, with their peer groups in the US and other parts of the world, and with the IT industry. If nothing is done, Systems research in Europe will decline, drying up the roots of innovation. This will negatively impact, not only the European IT industry, but beyond it, all sectors that are IT-based, e.g., financial services, government, health care, education, and manufacturers of high-value products such as aircrafts and cars. The European Systems community has started to address these issues through improved networking and by raising awareness among leaders in the business community, at universities, at funding agencies and among policy makers. This is a good start, but to excel, more is needed: changes by all players are necessary to improve the Systems research landscape. We make specific recommendations, which are detailed and justified in the main body of this paper. Here is a short summary: Universities: The top priority is to foster excellence at all levels of education and research. For students, we make the following recommendations: establish {\textquotedblleft}Research Masters{\textquotedblright} programmes feeding into a PhD; ease time limits on PhDs; generalise doctoral internships; encourage student exchanges. For faculty, we recommend outside hiring, evaluation involving outside peers, and evaluation metrics adapted to Systems. To compete for the best young talent, institutions should offer competitive working conditions, including stability, responsibility and significant career prospects. 
In particular, junior faculty should have a modest teaching load and receive mentorship, while enjoying the freedom to pursue their own research agenda. Industry: Our proposals aim to encourage innovation and technology transfer. The Systems research community and industry need to improve their interaction. Each side needs to better appreciate the other{\textquoteright}s needs and roles; e.g., intellectual property issues and the value of fundamental, risky, long-term research and publication. The European IT industry should offer internships for PhD students and hire more PhDs. Funding agencies: Funding agencies should support long-term, focused, risky and fundamental research projects. Systems research and infrastructure investment need to be sustained over sufficiently long periods. Funding decisions need to be based primarily on technical criteria, such as quality and impact; political criteria (such as balance between EU countries) must come second for research projects. There is currently a window of opportunity for attracting talented researchers to Europe and to establish Europe as the leading location for high-quality, high-impact Systems research. But to take advantage of this opportunity, the issues we raise need to be addressed now.}, url = {http://www.eurosys.org/whitepaper-2006/}, author = {Peter Druschel and Rebecca Isaacs and Thomas Gross and Marc Shapiro} } @conference {Eide:2007:EWR:1973430.1973446, title = {An experimentation workbench for replayable networking research}, booktitle = {Proceedings of the 4th USENIX conference on Networked systems design \& implementation}, series = {NSDI{\textquoteright}07}, year = {2007}, pages = {16{\textendash}16}, publisher = {USENIX Association}, organization = {USENIX Association}, address = {Berkeley, CA, USA}, abstract = {The networked and distributed systems research communities have an increasing need for "replayable" research, but our current experimentation resources fall short of satisfying this need. Replayable activities are those that can be re-executed, either as-is or in modified form, yielding new results that can be compared to previous ones. Replayability requires complete records of experiment processes and data, of course, but it also requires facilities that allow those processes to actually be examined, repeated, modified, and reused. We are now evolving Emulab, our popular network testbed management system, to be the basis of a new experimentation workbench in support of realistic, large-scale, replayable research. We have implemented a new model of testbed-based experiments that allows people to move forward and backward through their experimentation processes. Integrated tools help researchers manage their activities (both planned and unplanned), software artifacts, data, and analyses. We present the workbench, describe its implementation, and report how it has been used by early adopters. 
Our initial case studies highlight both the utility of the current workbench and additional usability challenges that must be addressed.}, url = {http://dl.acm.org/citation.cfm?id=1973430.1973446}, author = {Eide, Eric and Stoller, Leigh and Lepreau, Jay} } @conference {Eisenberg:2003:CCS:611892.612002, title = {Creating a computer science canon: a course of "classic" readings in computer science}, booktitle = {Proceedings of the 34th SIGCSE technical symposium on Computer science education}, series = {SIGCSE {\textquoteright}03}, year = {2003}, month = {01-2003}, pages = {336{\textendash}340}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, keywords = {canon, classic readings}, isbn = {1-58113-648-X}, doi = {http://doi.acm.org/10.1145/611892.612002}, url = {http://doi.acm.org/10.1145/611892.612002}, author = {Eisenberg, Michael} } @article {50, title = {Experimental Computer Science: The Need for a Cultural Change}, year = {2006}, month = {12/2006}, type = {White paper}, address = {Jerusalem, Israel}, abstract = {The culture of computer science emphasizes novelty and self-containment, leading to a fragmentation where each research project strives to create its own unique world. This approach is quite distinct from experimentation as it is known in other sciences {\textemdash} i.e. based on observations, hypothesis testing, and reproducibility {\textemdash} that is based on a presupposed common world. But there are many cases in which such experimental procedures can lead to interesting research results even in computer science. It is therefore proposed that greater acceptance of such activities would be beneficial and should be fostered.}, keywords = {Hypothesis Testing, Observation, Reproducibility}, url = {http://www.cs.huji.ac.il/~feit/papers/exp05.pdf}, author = {Dror G. Feitelson} } @book {Fenton:1998:SMR:580949, title = {Software Metrics: A Rigorous and Practical Approach}, year = {1998}, publisher = {PWS Publishing Co.}, organization = {PWS Publishing Co.}, edition = {2nd}, address = {Boston, MA, USA}, isbn = {0534954251}, url = {http://dl.acm.org/citation.cfm?id=580949}, author = {Fenton, Norman E. and Pfleeger, Shari Lawrence} } @article {131, title = {Cargo Cult Science}, journal = {Engineering and Science}, volume = {37}, year = {1974}, month = {06/1974}, pages = {10-13}, type = {Feature Article}, chapter = {10}, abstract = {Some remarks on science, pseudoscience, and learning how to not fool yourself. Caltech{\textquoteright}s 1974 commencement address.}, url = {http://calteches.library.caltech.edu/51/2/CargoCult.htm}, author = {Richard Feynman} } @article {Fleming:1986:LSC:5666.5673, title = {How Not to Lie with Statistics: The Correct Way to Summarize Benchmark Results}, journal = {Commun. ACM}, volume = {29}, number = {3}, year = {1986}, pages = {218{\textendash}221}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Using the arithmetic mean to summarize normalized benchmark results leads to mistaken conclusions that can be avoided by using the preferred method: the geometric mean.}, issn = {0001-0782}, doi = {10.1145/5666.5673}, url = {http://doi.acm.org/10.1145/5666.5673}, author = {Fleming, Philip J. 
and Wallace, John J.} } @conference {Frachtenberg:2005:PPJ:2146214.2146227, title = {Pitfalls in parallel job scheduling evaluation}, booktitle = {Proceedings of the 11th international conference on Job Scheduling Strategies for Parallel Processing}, series = {JSSPP{\textquoteright}05}, year = {2005}, pages = {257{\textendash}282}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, address = {Berlin, Heidelberg}, abstract = {There are many choices to make when evaluating the performance of a complex system. In the context of parallel job scheduling, one must decide what workload to use and what measurements to take. These decisions sometimes have subtle implications that are easy to overlook. In this paper we document numerous pitfalls one may fall into, with the hope of providing at least some help in avoiding them. Along the way, we also identify topics that could benefit from additional research.}, keywords = {dynamic workload, experimental methodology, parallel job scheduling, performance evaluation, simulation, static workload}, isbn = {3-540-31024-X, 978-3-540-31024-2}, doi = {10.1007/11605300_13}, url = {http://www.cs.huji.ac.il/\%7Efeit/parsched/jsspp05/p-05-13.pdf}, author = {Frachtenberg, Eitan and Dror G. Feitelson} } @conference {Georges:2008:JPE:1449764.1449794, title = {Java performance evaluation through rigorous replay compilation}, booktitle = {Proceedings of the 23rd ACM SIGPLAN conference on Object-oriented programming systems languages and applications}, series = {OOPSLA {\textquoteright}08}, year = {2008}, pages = {367{\textendash}384}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {A managed runtime environment, such as the Java virtual machine, is non-trivial to benchmark. Java performance is affected in various complex ways by the application and its input, as well as by the virtual machine (JIT optimizer, garbage collector, thread scheduler, etc.). In addition, non-determinism due to timer-based sampling for JIT optimization, thread scheduling, and various system effects further complicate the Java performance benchmarking process. Replay compilation is a recently introduced Java performance analysis methodology that aims at controlling non-determinism to improve experimental repeatability. The key idea of replay compilation is to control the compilation load during experimentation by inducing a pre-recorded compilation plan at replay time. Replay compilation also enables teasing apart performance effects of the application versus the virtual machine. This paper argues that in contrast to current practice which uses a single compilation plan at replay time, multiple compilation plans add statistical rigor to the replay compilation methodology. By doing so, replay compilation better accounts for the variability observed in compilation load across compilation plans. In addition, we propose matched-pair comparison for statistical data analysis. 
Matched-pair comparison considers the performance measurements per compilation plan before and after an innovation of interest as a pair, which enables limiting the number of compilation plans needed for accurate performance analysis compared to statistical analysis assuming unpaired measurements.}, keywords = {benchmarking, Java, matched-pair comparison, performance evaluation, replay compilation, virtual machine}, isbn = {978-1-60558-215-3}, doi = {http://doi.acm.org/10.1145/1449764.1449794}, url = {http://doi.acm.org/10.1145/1449764.1449794}, author = {Georges, Andy and Eeckhout, Lieven and Buytaert, Dries} } @conference {1297033, title = {Statistically rigorous Java performance evaluation}, booktitle = {OOPSLA {\textquoteright}07: Proceedings of the 22nd annual ACM SIGPLAN conference on Object-oriented programming systems and applications}, year = {2007}, pages = {57{\textendash}76}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Java performance is far from being trivial to benchmark because it is affected by various factors such as the Java application, its input, the virtual machine, the garbage collector, the heap size, etc. In addition, non-determinism at run-time causes the execution time of a Java program to differ from run to run. There are a number of sources of non-determinism such as Just-In-Time (JIT) compilation and optimization in the virtual machine (VM) driven by timer-based method sampling, thread scheduling, garbage collection, and various system effects. There exist a wide variety of Java performance evaluation methodologies used by researchers and benchmarkers. These methodologies differ from each other in a number of ways. Some report average performance over a number of runs of the same experiment; others report the best or second best performance observed; yet others report the worst. Some iterate the benchmark multiple times within a single VM invocation; others consider multiple VM invocations and iterate a single benchmark execution; yet others consider multiple VM invocations and iterate the benchmark multiple times. This paper shows that prevalent methodologies can be misleading, and can even lead to incorrect conclusions. The reason is that the data analysis is not statistically rigorous. In this paper, we present a survey of existing Java performance evaluation methodologies and discuss the importance of statistically rigorous data analysis for dealing with non-determinism. We advocate approaches to quantify startup as well as steady-state performance, and, in addition, we provide the JavaStats software to automatically obtain performance numbers in a rigorous manner. 
Although this paper focuses on Java performance evaluation, many of the issues addressed in this paper also apply to other programming languages and systems that build on a managed runtime system.}, keywords = {benchmarking, data analysis, Java, methodology, statistics}, isbn = {978-1-59593-786-5}, doi = {http://doi.acm.org/10.1145/1297027.1297033}, author = {Georges, Andy and Buytaert, Dries and Eeckhout, Lieven} } @conference {Gil:2011:MCS:2095050.2095100, title = {A microbenchmark case study and lessons learned}, booktitle = {Proceedings of the compilation of the co-located workshops on DSM{\textquoteright}11, TMC{\textquoteright}11, AGERE!{\textquoteright}11, AOOPES{\textquoteright}11, NEAT{\textquoteright}11, \& VMIL{\textquoteright}11}, series = {SPLASH {\textquoteright}11 Workshops}, year = {2011}, pages = {297{\textendash}308}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {The extra abstraction layer posed by the virtual machine, the JIT compilation cycles and the asynchronous garbage collection are the main reasons that make the benchmarking of Java code a delicate task. The primary weapon in battling these is replication: "billions and billions of runs" is a phrase sometimes used by practitioners. This paper describes a case study, which consumed hundreds of hours of CPU time, and tries to characterize the inconsistencies in the results we encountered.}, keywords = {benchmark, measurements, steady-state}, isbn = {978-1-4503-1183-0}, doi = {10.1145/2095050.2095100}, url = {http://doi.acm.org/10.1145/2095050.2095100}, author = {Gil, Joseph Yossi and Lenz, Keren and Shimron, Yuval} } @conference {Hanenberg:2010:FHL:1869459.1869536, title = {Faith, hope, and love: an essay on software science{\textquoteright}s neglect of human factors}, booktitle = {Proceedings of the ACM international conference on Object oriented programming systems languages and applications}, series = {OOPSLA {\textquoteright}10}, year = {2010}, month = {10/2010}, pages = {933{\textendash}946}, publisher = {ACM}, organization = {ACM}, address = {Reno, NV, USA}, abstract = {Research in the area of programming languages has different facets -- from formal reasoning about new programming language constructs (such as type soundness proofs for new type systems) over inventions of new abstractions, up to performance measurements of virtual machines. A closer look into the underlying research methods reveals a distressing characteristic of programming language research: developers, which are the main audience for new language constructs, are hardly considered in the research process. As a consequence, it is simply not possible to state whether a new construct that requires some kind of interaction with the developer has any positive impact on the construction of software. 
This paper argues for appropriate research methods in programming language research that rely on studies of developers -- and argues that the introduction of corresponding empirical methods not only requires a new understanding of research but also a different view on how to teach software science to students.}, keywords = {empirical research, programming language research, research methods, software engineering}, isbn = {978-1-4503-0203-6}, doi = {http://doi.acm.org/10.1145/1869459.1869536}, url = {http://doi.acm.org/10.1145/1869459.1869536}, author = {Hanenberg, Stefan} } @conference {Hoefler:2015:SBP:2807591.2807644, title = {Scientific Benchmarking of Parallel Computing Systems: Twelve Ways to Tell the Masses when Reporting Performance Results}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, series = {SC {\textquoteright}15}, year = {2015}, pages = {73:1{\textendash}73:12}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Measuring and reporting performance of parallel computers constitutes the basis for scientific advancement of high-performance computing (HPC). Most scientific reports show performance improvements of new techniques and are thus obliged to ensure reproducibility or at least interpretability. Our investigation of a stratified sample of 120 papers across three top conferences in the field shows that the state of the practice is lacking. For example, it is often unclear if reported improvements are deterministic or observed by chance. In addition to distilling best practices from existing work, we propose statistically sound analysis and reporting techniques and simple guidelines for experimental design in parallel computing and codify them in a portable benchmarking library. We aim to improve the standards of reporting research results and initiate a discussion in the HPC field. A wide adoption of our minimal set of rules will lead to better interpretability of performance results and improve the scientific culture in HPC.}, keywords = {benchmarking, data analysis, parallel computing, statistics}, isbn = {978-1-4503-3723-6}, doi = {10.1145/2807591.2807644}, url = {http://doi.acm.org/10.1145/2807591.2807644}, author = {Hoefler, Torsten and Belli, Roberto} } @conference {Host:2005:ECC:1062455.1062539, title = {Experimental context classification: incentives and experience of subjects}, booktitle = {Proceedings of the 27th international conference on Software engineering}, series = {ICSE {\textquoteright}05}, year = {2005}, pages = {470{\textendash}478}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {There is a need to identify factors that affect the result of empirical studies in software engineering research. It is still the case that seemingly identical replications of controlled experiments result in different conclusions due to the fact that all factors describing the experiment context are not clearly defined and hence controlled. In this article, a scheme for describing the participants of controlled experiments is proposed and evaluated. It consists of two main factors, the incentives for participants in the experiment and the experience of the participants. The scheme has been evaluated by classifying a set of previously conducted experiments from literature. It can be concluded that the scheme was easy to use and understand. 
It is also found that experiments that are classified in the same way to a large extent point at the same results, which indicates that the scheme addresses relevant factors.}, keywords = {Experimentation, subject experience, subject motivation}, isbn = {1-58113-963-2}, doi = {http://doi.acm.org/10.1145/1062455.1062539}, url = {http://doi.acm.org/10.1145/1062455.1062539}, author = {H{\"o}st, Martin and Wohlin, Claes and Thelin, Thomas} } @book {372, title = {How to Lie with Statistics}, year = {1954}, publisher = {W. W. Norton \& Co.}, organization = {W. W. Norton \& Co.}, address = {New York}, isbn = {0-393-05264-8}, url = {http://www.horace.org/blog/wp-content/uploads/2012/05/How-to-Lie-With-Statistics-1954-Huff.pdf}, author = {Darrell Huff} } @article {10.1371/journal.pmed.0020124, title = {Why Most Published Research Findings Are False}, journal = {PLoS Medicine}, volume = {2}, number = {8}, year = {2005}, month = {08/2005}, pages = {e124}, publisher = {Public Library of Science}, abstract = {There is increasing concern that most current published research findings are false. The probability that a research claim is true may depend on study power and bias, the number of other studies on the same question, and, importantly, the ratio of true to no relationships among the relationships probed in each scientific field. In this framework, a research finding is less likely to be true when the studies conducted in a field are smaller; when effect sizes are smaller; when there is a greater number and lesser preselection of tested relationships; where there is greater flexibility in designs, definitions, outcomes, and analytical modes; when there is greater financial and other interest and prejudice; and when more teams are involved in a scientific field in chase of statistical significance. Simulations show that for most study designs and settings, it is more likely for a research claim to be false than true. Moreover, for many current scientific fields, claimed research findings may often be simply accurate measures of the prevailing bias. In this essay, I discuss the implications of these problems for the conduct and interpretation of research.}, doi = {10.1371/journal.pmed.0020124}, url = {http://dx.doi.org/10.1371\%2Fjournal.pmed.0020124}, author = {Ioannidis, John P. A.} } @book {Jain01TheArt, title = {The Art of Computer Systems Performance Analysis: techniques for experimental design, measurement, simulation, and modeling}, year = {1991}, publisher = {Wiley}, organization = {Wiley}, keywords = {analysis, analytical, evaluation, experimental, Performance, simulation}, url = {http://www1.cse.wustl.edu/~jain/books/perfbook.htm}, author = {Jain, Raj} } @inbook {springerlink:10.1007/978-3-642-01156-6_6, title = {Richard Hamming - You and Your Research}, booktitle = {Simula Research Laboratory}, year = {2010}, note = {10.1007/978-3-642-01156-6_6}, pages = {37-60}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, abstract = {At a seminar in the Bell Communications Research Colloquia Series, Dr. Richard~W. Hamming, a Professor at the Naval Postgraduate School in Monterey, California and a retired Bell Labs scientist, gave a very interesting and stimulating talk, You and Your Research to an overflow audience of some 200 Bellcore staff members and visitors at the Morris Research and Engineering Center on March 7, 1986. 
This talk centered on Hamming{\textquoteright}s observations and research on the question Why do so few scientists make significant contributions and so many are forgotten in the long run? From his more than forty years of experience, thirty of which were at Bell Laboratories, he has made a number of direct observations, asked very pointed questions of scientists about what, how, and why they did things, studied the lives of great scientists and great contributions, and has done introspection and studied theories of creativity. The talk is about what he has learned in terms of the properties of the individual scientists, their abilities, traits, working habits, attitudes, and philosophy.}, isbn = {978-3-642-01156-6}, url = {http://dx.doi.org/10.1007/978-3-642-01156-6_6}, author = {Kaiser, J. F.}, editor = {Tveito, Aslak and Bruaset, Are Magnus and Lysne, Olav} } @conference {Kalibera:2013:RBR:2491894.2464160, title = {Rigorous Benchmarking in Reasonable Time}, booktitle = {Proceedings of the 2013 International Symposium on Memory Management}, series = {ISMM {\textquoteright}13}, year = {2013}, month = {06/2013}, pages = {63{\textendash}74}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Experimental evaluation is key to systems research. Because modern systems are complex and non-deterministic, good experimental methodology demands that researchers account for uncertainty. To obtain valid results, they are expected to run many iterations of benchmarks, invoke virtual machines (VMs) several times, or even rebuild VM or benchmark binaries more than once. All this repetition costs time to complete experiments. Currently, many evaluations give up on sufficient repetition or rigorous statistical methods, or even run benchmarks only in training sizes. The results reported often lack proper variation estimates and, when a small difference between two systems is reported, some are simply unreliable. In contrast, we provide a statistically rigorous methodology for repetition and summarising results that makes efficient use of experimentation time. Time efficiency comes from two key observations. First, a given benchmark on a given platform is typically prone to much less non-determinism than the common worst-case of published corner-case studies. Second, repetition is most needed where most uncertainty arises (whether between builds, between executions or between iterations). We capture experimentation cost with a novel mathematical model, which we use to identify the number of repetitions at each level of an experiment necessary and sufficient to obtain a given level of precision. We present our methodology as a cookbook that guides researchers on the number of repetitions they should run to obtain reliable results. We also show how to present results with an effect size confidence interval. 
As an example, we show how to use our methodology to conduct throughput experiments with the DaCapo and SPEC CPU benchmarks on three recent platforms.}, keywords = {benchmarking methodology, DaCapo, spec cpu, statistical methods}, isbn = {978-1-4503-2100-6}, doi = {10.1145/2464157.2464160}, url = {http://doi.acm.org/10.1145/2464157.2464160}, author = {Kalibera, Tomas and Jones, Richard} } @article {229, title = {Supporting Experimental Computer Science}, number = {ANL MCS Technical Memo 326}, year = {2012}, abstract = {The ability to conduct consistent, controlled, and repeatable large-scale experiments in all areas of computer science related to parallel, large-scale, or distributed computing and networking is critical to the future and development of computer science. Yet conducting such experiments is still too often a challenge for researchers, students, and practitioners because of the unavailability of dedicated resources, inability to create controlled experimental conditions, and variability in software. Availability, repeatability, and open sharing of electronic products are all still difficult to achieve. To discuss those challenges and share experiences in their solution, the Workshop on Experimental Support for Computer Science [1] brought together scientists involved in building and operating infrastructures dedicated to supporting computer science experiments to discuss challenges and solutions in this space. The workshop was held in November 2011 and was collocated with the SC11 conference in Seattle, Wash. Our objectives were to share experiences and knowledge related to supporting large-scale experiments conducted on experimental infrastructures, understand user requirements, and discuss methodologies and opportunities created by emerging technologies. This report ties together the workshop presentations and discussion and the consensus that emerged on the state of the field and directions for moving forward. In Section 2 we set the stage by describing the experimental culture and existing methodology in computer science. In Section 3, we describe the properties of the experimental testbeds whose representatives were participating in the workshop{\textemdash}Grid{\textquoteright}5000 in France and Future Grid and Open Cirrus in the United States{\textemdash}as well as the projects that these testbeds support. The layers of experimental infrastructure are described in Section 4, followed in Sections 5 and 6 by profiles of tools and approaches taken by the respective testbeds to provide basic experiment management services and experiment orchestration. In Section 7 we summarize the workshop findings.}, url = {http://www.nimbusproject.org/downloads/Supporting_Experimental_Computer_Science_final_draft.pdf}, author = {Kate Keahey and Fr{\'e}d{\'e}ric Desprez} } @article {636197, title = {Preliminary guidelines for empirical research in software engineering}, journal = {IEEE Trans. Softw. Eng.}, volume = {28}, number = {8}, year = {2002}, month = {08/2002}, pages = {721{\textendash}734}, publisher = {IEEE Press}, address = {Piscataway, NJ, USA}, abstract = {Empirical software engineering research needs research guidelines to improve the research and reporting processes. We propose a preliminary set of research guidelines aimed at stimulating discussion among software researchers. They are based on a review of research guidelines developed for medical researchers and on our own experience in doing and reviewing software engineering research. 
The guidelines are intended to assist researchers, reviewers, and meta-analysts in designing, conducting, and evaluating empirical studies. Editorial boards of software engineering journals may wish to use our recommendations as a basis for developing guidelines for reviewers and for framing policies for dealing with the design, data collection, and analysis and reporting of empirical studies. }, keywords = {empirical software research, research guidelines, statistical mistakes}, issn = {0098-5589}, doi = {http://dx.doi.org/10.1109/TSE.2002.1027796}, author = {Kitchenham, Barbara A. and Pfleeger, Shari Lawrence and Pickard, Lesley M. and Jones, Peter W. and Hoaglin, David C. and Emam, Khaled El and Rosenberg, Jarrett} } @article {Ko:2015:PGC:2727049.2727076, title = {A Practical Guide to Controlled Experiments of Software Engineering Tools with Human Participants}, journal = {Empirical Softw. Engg.}, volume = {20}, number = {1}, year = {2015}, pages = {110{\textendash}141}, publisher = {Kluwer Academic Publishers}, address = {Hingham, MA, USA}, abstract = {Empirical studies, often in the form of controlled experiments, have been widely adopted in software engineering research as a way to evaluate the merits of new software engineering tools. However, controlled experiments involving human participants actually using new tools are still rare, and when they are conducted, some have serious validity concerns. Recent research has also shown that many software engineering researchers view this form of tool evaluation as too risky and too difficult to conduct, as they might ultimately lead to inconclusive or negative results. In this paper, we aim both to help researchers minimize the risks of this form of tool evaluation, and to increase their quality, by offering practical methodological guidance on designing and running controlled experiments with developers. Our guidance fills gaps in the empirical literature by explaining, from a practical perspective, options in the recruitment and selection of human participants, informed consent, experimental procedures, demographic measurements, group assignment, training, the selecting and design of tasks, the measurement of common outcome variables such as success and time on task, and study debriefing. Throughout, we situate this guidance in the results of a new systematic review of the tool evaluations that were published in over 1,700 software engineering papers published from 2001 to 2011.}, keywords = {experiments, Human participants, Human subjects, Research methodology, Tools}, issn = {1382-3256}, doi = {10.1007/s10664-013-9279-3}, url = {http://dx.doi.org/10.1007/s10664-013-9279-3}, author = {Ko, Andrew J. and Latoza, Thomas D. and Burnett, Margaret M.} } @article {Lea:2008:LPE:1480828.1480848, title = {Languages and performance engineering: method, instrumentation, and pedagogy}, journal = {SIGPLAN Not.}, volume = {43}, number = {11}, year = {2008}, pages = {87{\textendash}92}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Programs encounter increasingly complex and fragile mappings to computing platforms, resulting in performance characteristics that are often mysterious to students, practitioners, and even researchers. 
We discuss some steps toward an experimental methodology that demands and provides a deep understanding of complete systems, the necessary instrumentation and tools to support such a methodology, and a curriculum that teaches the methodology and tools as a fundamental part of the discipline.}, keywords = {programming languages curriculum}, issn = {0362-1340}, doi = {10.1145/1480828.1480848}, url = {http://doi.acm.org/10.1145/1480828.1480848}, author = {Lea, Doug and Bacon, David F. and Grove, David} } @article {76, title = {The Truth Wears Off: Is there something wrong with the scientific method?}, journal = {The New Yorker}, year = {2010}, month = {12/2010}, pages = {52}, abstract = {Many results that are rigorously proved and accepted start shrinking in later studies.}, keywords = {Decline Effect, Jonathan Schooler, Replicability, Scientific Experiments, Scientific Theories, Scientists, statistics}, url = {http://www.newyorker.com/reporting/2010/12/13/101213fa_fact_lehrer}, author = {Lehrer, Jonah} } @book {citeulike:5314090, title = {Measuring Computer Performance: A Practitioner{\textquoteright}s Guide}, year = {2005}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, keywords = {benchmarking, Performance}, isbn = {0521646707}, url = {http://www.arctic.umn.edu/perf-book/}, author = {Lilja, David J.} } @article {80, title = {Report on Workshop on Research in Experimental Computer Science}, year = {1992}, month = {06/1992}, pages = {49}, abstract = {This report describes a workshop that was concerned with how to improve research in experimental computer science. The overall goal of the workshop was to identify problems and issues in experimental computer science and to propose solutions. The workshop was sponsored by the Office of Naval Research, in coordination with the NSF, DARPA, and other science agencies that participate in the Federal Coordinating Council on Science Engineering and Technology (FCCSET). It was held on October 16-18, 1991, in Palo Alto, CA. The workshop consisted of two parts. For the first day and a quarter the entire set of attendees met as a group in an attempt to identify problems and issues that required more detailed discussion. An overview of what happened in these sessions is given in Section 1; Section 1.5 describes some conclusions based on these sessions. The rest of the workshop was spent in small working groups that focused on specific issues. Each of these groups was charged with coming up with a set of proposed solutions to problems in the specific area they were addressing. Section 2 describes what happened in the working groups and the recommendations. Each session (whether attended by the entire group or a subgroup) had a scribe who was responsible for taking notes during that session and providing a written summary, and a session leader who led the discussion and reported on it at the final workshop session. The summaries are included in Appendix A. 
Information about the program committee and attendees is contained in Appendix B.}, keywords = {Computers, Engineering, Research Management}, url = {http://oai.dtic.mil/oai/oai?verb=getRecord\&metadataPrefix=html\&identifier=ADA256874}, author = {Barbara Liskov} } @article {Manegold:2010:RWE:1815933.1815944, title = {Repeatability \& workability evaluation of SIGMOD 2009}, journal = {SIGMOD Rec.}, volume = {38}, year = {2010}, month = {December}, pages = {40{\textendash}43}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {SIGMOD 2008 was the first database conference that offered to test submitters{\textquoteright} programs against their data to verify the repeatability of the experiments published [1]. Given the positive feedback concerning the SIGMOD 2008 repeatability initiative, SIGMOD 2009 modified and expanded the initiative with a workability assessment.}, issn = {0163-5808}, doi = {http://doi.acm.org/10.1145/1815933.1815944}, url = {http://doi.acm.org/10.1145/1815933.1815944}, author = {Manegold, S. and Manolescu, I. and Afanasiev, L. and Feng, J. and Gou, G. and Hadjieleftheriou, M. and Harizopoulos, S. and Kalnis, P. and Karanasos, K. and Laurent, D. and Lupu, M. and Onose, N. and R{\'e}, C. and Sans, V. and Senellart, P. and Wu, T. and Shasha, D.} } @article {Manolescu:2008:RES:1374780.1374791, title = {The repeatability experiment of SIGMOD 2008}, journal = {SIGMOD Rec.}, volume = {37}, year = {2008}, month = {March}, pages = {39{\textendash}45}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {SIGMOD 2008 was the first database conference that offered to test submitters{\textquoteright} programs against their data to verify the experiments published. This paper discusses the rationale for this effort, the community{\textquoteright}s reaction, our experiences, and advice for future similar efforts.}, issn = {0163-5808}, doi = {http://doi.acm.org/10.1145/1374780.1374791}, url = {http://doi.acm.org/10.1145/1374780.1374791}, author = {Manolescu, I. and Afanasiev, L. and Arion, A. and Dittrich, J. and Manegold, S. and Polyzotis, N. and Schnaitter, K. and Senellart, P. and Zoupanos, S. and Shasha, D.} } @article {Matthews:2004:CRR:991130.991131, title = {The case for repeated research in operating systems}, journal = {SIGOPS Oper. Syst. Rev.}, volume = {38}, year = {2004}, month = {April}, pages = {5{\textendash}7}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Repeated research is a well-respected model of investigation in many sciences. Independent tests of published research are valued because they document the general applicability of results. In addition, repeated research often sheds new light on aspects of a work not fully explored in the original publication and exposes unreported limitations. In computer science, researchers typically report results from testing software that they themselves have implemented. It is natural to wonder why independent tests of published results are so rare. [...] In the sections that follow, I describe a model for repeated research in computer science. I also discuss my experiences using this model in more detail. 
[...]}, issn = {0163-5980}, doi = {http://doi.acm.org/10.1145/991130.991131}, url = {http://doi.acm.org/10.1145/991130.991131}, author = {Matthews, Jeanna Neefe} } @article {Mitzenmacher:2015:TWE:2817191.2699413, title = {Theory Without Experiments: Have We Gone Too Far?}, journal = {Commun. ACM}, volume = {58}, number = {9}, year = {2015}, pages = {40{\textendash}42}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Seeking a better understanding of computing through a mixture of theory and appropriate experimental evidence.}, issn = {0001-0782}, doi = {10.1145/2699413}, url = {http://doi.acm.org/10.1145/2699413}, author = {Mitzenmacher, Michael} } @article {Mudge:1996:RPL:232790.232791, title = {Report on the panel: "how can computer architecture researchers avoid becoming the society for irreproducible results?"}, journal = {SIGARCH Comput. Archit. 
News}, volume = {24}, year = {1996}, month = {03/1996}, pages = {1{\textendash}5}, publisher = {ACM}, address = {New York, NY, USA}, keywords = {Reproducibility}, issn = {0163-5964}, doi = {http://doi.acm.org/10.1145/232790.232791}, url = {http://doi.acm.org/10.1145/232790.232791}, author = {Mudge, Trevor} } @conference {1508275, title = {Producing wrong data without doing anything obviously wrong!}, booktitle = {ASPLOS {\textquoteright}09: Proceeding of the 14th international conference on Architectural support for programming languages and operating systems}, year = {2009}, pages = {265{\textendash}276}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {This paper presents a surprising result: changing a seemingly innocuous aspect of an experimental setup can cause a systems researcher to draw wrong conclusions from an experiment. What appears to be an innocuous aspect in the experimental setup may in fact introduce a significant bias in an evaluation. This phenomenon is called measurement bias in the natural and social sciences. Our results demonstrate that measurement bias is significant and commonplace in computer system evaluation. By significant we mean that measurement bias can lead to a performance analysis that either over-states an effect or even yields an incorrect conclusion. By commonplace we mean that measurement bias occurs in all architectures that we tried (Pentium 4, Core 2, and m5 O3CPU), both compilers that we tried (gcc and Intel{\textquoteright}s C compiler), and most of the SPEC CPU2006 C programs. Thus, we cannot ignore measurement bias. Nevertheless, in a literature survey of 133 recent papers from ASPLOS, PACT, PLDI, and CGO, we determined that none of the papers with experimental results adequately consider measurement bias. Inspired by similar problems and their solutions in other sciences, we describe and demonstrate two methods, one for detecting (causal analysis) and one for avoiding (setup randomization) measurement bias.}, keywords = {Experimentation, Measurement, Performance}, isbn = {978-1-60558-406-5}, doi = {http://doi.acm.org/10.1145/1508244.1508275}, author = {Mytkowicz, Todd and Diwan, Amer and Hauswirth, Matthias and Sweeney, Peter F.} } @conference {1806618, title = {Evaluating the accuracy of Java profilers}, booktitle = {PLDI {\textquoteright}10: Proceedings of the 2010 ACM SIGPLAN conference on Programming language design and implementation}, year = {2010}, pages = {187{\textendash}197}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Performance analysts profile their programs to find methods that are worth optimizing: the "hot" methods. This paper shows that four commonly-used Java profilers (xprof , hprof , jprofile, and yourkit) often disagree on the identity of the hot methods. If two profilers disagree, at least one must be incorrect. Thus, there is a good chance that a profiler will mislead a performance analyst into wasting time optimizing a cold method with little or no performance improvement. This paper uses causality analysis to evaluate profilers and to gain insight into the source of their incorrectness. It shows that these profilers all violate a fundamental requirement for sampling-based profilers: to be correct, a sampling-based profiler must collect samples randomly. We show that a proof-of-concept profiler, which collects samples randomly, does not suffer from the above problems. 
Specifically, we show, using a number of case studies, that our profiler correctly identifies methods that are important to optimize; in some cases other profilers report that these methods are cold and thus not worth optimizing.}, keywords = {Experimentation, Performance}, isbn = {978-1-4503-0019-3}, doi = {http://doi.acm.org/10.1145/1806596.1806618}, author = {Mytkowicz, Todd and Diwan, Amer and Hauswirth, Matthias and Sweeney, Peter F.} } @article {310, title = {Warning Signs in Experimental Design and Interpretation}, volume = {2012}, year = {2012}, abstract = {When an experimental study states "The group with treatment X had significantly less disease (p = 1\%)", many people interpret this statement as being equivalent to "there is a 99\% chance that if I do treatment X it will prevent disease." This essay explains why these statements are not equivalent. For such an experiment, all of the following are possible: X is in fact an effective treatment as claimed. X is only effective for some people, but not for me, because I am different in a way that the experiment failed to distinguish. X is ineffective, and only looked effective due to random chance. X is ineffective because of a systematic flaw in the experiment. X is ineffective and the experimenters and/or reader misinterpreted the results to say that it is. There is no way to know for sure which possibility holds, but there are warning signs that can dilute the credibility of an experiment. In Part I we look at warning signs in the design of an experiment that can render it uninformative; in Part II at warning signs in the interpretation of an experiment that can lead the reader to give it more credibility than it deserves. The presence of any one warning sign does not invalidate a study -- there certainly are valid and convincing studies that are not randomized, for example -- but the more warning signs the more skeptical you should be.}, url = {http://norvig.com/experiment-design.html}, author = {Peter Norvig} } @article {362, title = {Why we should trust scientists}, year = {2014}, publisher = {TED}, type = {TED Talk}, address = {New York}, abstract = {Many of the world{\textquoteright}s biggest problems require asking questions of scientists {\textemdash} but why should we believe what they say? Historian of science Naomi Oreskes thinks deeply about our relationship to belief and draws out three problems with common attitudes toward scientific inquiry {\textemdash} and gives her own reasoning for why we ought to trust science.}, url = {http://www.ted.com/talks/naomi_oreskes_why_we_should_believe_in_science}, author = {Naomi Oreskes} } @conference {Perry:2000:ESS:336512.336586, title = {Empirical studies of software engineering: a roadmap}, booktitle = {Proceedings of the Conference on The Future of Software Engineering}, series = {ICSE {\textquoteright}00}, year = {2000}, pages = {345{\textendash}355}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In this article we summarize the strengths and weaknesses of empirical research in software engineering. We argue that in order to improve the current situation we must create better studies and draw more credible interpretations from them. 
We finally present a roadmap for this improvement, which includes a general structure for software empirical studies and concrete steps for achieving these goals: designing better studies, collecting data more effectively, and involving others in our empirical enterprises.}, keywords = {Empirical studies, software engineering}, isbn = {1-58113-253-0}, doi = {10.1145/336512.336586}, url = {http://doi.acm.org/10.1145/336512.336586}, author = {Perry, Dewayne E. and Porter, Adam A. and Votta, Lawrence G.} } @booklet {NIST.TN.1830, title = {The ghost in the machine: don{\textquoteright}t let it haunt your software performance measurements}, number = {NIST.TN.1830}, year = {2014}, month = {April}, publisher = {National Institute of Standards and Technology, US Department of Commerce}, address = {100 Bureau Drive, Gaithersburg, Maryland}, url = {http://dx.doi.org/10.6028/NIST.TN.1830}, author = {Vreda Pieterse and David Flater} } @article {358, title = {The meaning and limits of exact science}, journal = {Science}, volume = {110}, year = {1949}, note = {Originally published as "Sinn und Grenzen der Exakten Wissenschaft" in 1947, based on a talk held in November 1941}, month = {09/1949}, chapter = {319}, doi = {10.1126/science.110.2857.319}, url = {http://wilsede-alive.com/wp-content/uploads/files/max\%20planck\%20index\%20opt.pdf}, author = {Max Planck} } @book {346, title = {The Logic of Scientific Discovery}, year = {1959}, publisher = {Hutchinson \& Co.}, organization = {Hutchinson \& Co.}, isbn = {0-415-27844-9}, url = {http://s-f-walker.org.uk/pubsebooks/pdfs/popper-logic-scientific-discovery.pdf}, author = {Popper, Karl} } @article {Prechelt:jucs_3_9:why_we_need_an, title = {Why We Need an Explicit Forum for Negative Results}, journal = {Journal of Universal Computer Science}, volume = {3}, number = {9}, year = {1997}, pages = {1074{\textendash}1083}, abstract = {Current Computer Science (CS) research is primarily focused on solving engineering problems. Often though, promising attempts for solving a particular problem fail for non-avoidable reasons. This is what I call a negative result: something that should have worked does not. Due to the current CS publication climate such negative results today are usually camouflaged as positive results by non-evaluating or mis-evaluating the research or by redefining the problem to fit the solution. Such publication behavior hampers progress in CS by suppressing some valuable insights, producing spurious understanding, and misleading further research efforts. Specific examples given below illustrate and back up these claims. This paper is the announcement of a (partial) remedy: a permanent publication forum explicitly for negative CS research results, called the Forum for Negative Results, FNR. FNR will be a regular part of J.UCS.}, keywords = {failures, FNR, forum, negative results, progress, research culture}, doi = {10.3217/jucs-003-09-1074}, url = {http://www.jucs.org/jucs_3_9/why_we_need_an}, author = {Lutz Prechelt} } @article {278, title = {Stats: We{\textquoteright}re Doing It Wrong}, journal = {BLOG@CACM}, year = {2011}, publisher = {ACM}, abstract = {It{\textquoteright}s quite common for HCI or computer science education researchers to use attitude questionnaires to examine people{\textquoteright}s opinions of new software or teaching interventions. These are often on a likert-type scale of strongly agree to strongly disagree. And the sad truth is that researchers typically use the wrong statistical techniques to analyse them. 
}, url = {http://cacm.acm.org/blogs/blog-cacm/107125-stats-were-doing-it-wrong/fulltext}, author = {Judy Robertson} } @article {Schulte:Davison:Dye:Dominik:2011:JSSOBK:v46i03, title = {A Multi-Language Computing Environment for Literate Programming and Reproducible Research}, journal = {Journal of Statistical Software}, volume = {46}, number = {3}, year = {2012}, month = {1}, pages = {1{\textendash}24}, abstract = {We present a new computing environment for authoring mixed natural and computer language documents. In this environment a single hierarchically-organized plain text source file may contain a variety of elements such as code in arbitrary programming languages, raw data, links to external resources, project management data, working notes, and text for publication. Code fragments may be executed in situ with graphical, numerical and textual output captured or linked in the file. Export to LATEX, HTML, LATEX beamer, DocBook and other formats permits working reports, presentations and manuscripts for publication to be generated from the file. In addition, functioning pure code files can be automatically extracted from the file. This environment is implemented as an extension to the Emacs text editor and provides a rich set of features for authoring both prose and code, as well as sophisticated project management capabilities.}, issn = {1548-7660}, url = {http://www.jstatsoft.org/v46/i03}, author = {Eric Schulte and Dan Davison and Thomas Dye and Carsten Dominik} } @conference {Singer:2011:LEM:2048237.2048249, title = {A literate experimentation manifesto}, booktitle = {Proceedings of the 10th SIGPLAN symposium on New ideas, new paradigms, and reflections on programming and software}, series = {ONWARD {\textquoteright}11}, year = {2011}, pages = {91{\textendash}102}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {This paper proposes a new approach to experimental computer systems research, which we call Literate Experimentation. Conventionally, experimental procedure and writeup are divided into distinct phases: i.e. setup (the method), data collection (the results) and analysis (the evaluation of the results). Our concept of a literate experiment is to have a single, rich, human-generated, text-based description of a particular experiment, from which can be automatically derived: (1) a summary of the experimental setup to include in the paper; (2) a sequence of executable commands to setup a computer platform ready to perform the actual experiment; (3) the experiment itself, executed on this appropriately configured platform; and, (4) a means of generating results tables and graphs from the experimental output, ready for inclusion in the paper. Our Literate Experimentation style has largely been inspired by Knuth{\textquoteright}s Literate Programming philosophy. Effectively, a literate experiment is a small step towards the executable paper panacea. In this work, we argue that a literate experimentation approach makes it easier to produce rigorous experimental evaluation papers. We suggest that such papers are more likely to be accepted for publication, due to (a) the imposed uniformity of structure, and (b) the assurance that experimental results are easily reproducible. 
We present a case study of a prototype literate experiment involving memory management in Jikes RVM.}, keywords = {experimental write-up, literate programming}, isbn = {978-1-4503-0941-7}, doi = {10.1145/2048237.2048249}, url = {http://doi.acm.org/10.1145/2048237.2048249}, author = {Singer, Jeremy} } @conference {Sjoberg:2002:CRE:857197.857886, title = {Conducting Realistic Experiments in Software Engineering}, booktitle = {Proceedings of the 2002 International Symposium on Empirical Software Engineering}, year = {2002}, pages = {17{\textendash}}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {An important goal of most empirical software engineering research is the transfer of research results to industrial applications. Two important obstacles for this transfer are the lack of control of variables of case studies, i.e., the lack of explanatory power, and the lack of realism of controlled experiments. While it may be difficult to increase the explanatory power of case studies, there is a large potential for increasing the realism of controlled software engineering experiments. To convince industry about the validity and applicability of the experimental results, the tasks, subjects and the environments of the experiments should be as realistic as practically possible. Such experiments are, however, more expensive than experiments involving students, small tasks and pen-and-paper environments. Consequently, a change towards more realistic experiments requires a change in the amount of resources spent on software engineering experiments. This paper argues that software engineering researchers should apply for resources enabling expensive and realistic software engineering experiments similar to how other researchers apply for resources for expensive software and hardware that are necessary for their research. The paper describes experiences from recent experiments that varied in size from involving one software professional for 5 days to 130 software professionals, from 9 consultancy companies, for one day each.}, keywords = {Empirical software engineering, experiments, professionals, technology transfer}, isbn = {0-7695-1796-X}, url = {http://dl.acm.org/citation.cfm?id=857197.857886}, author = {Sj{\o}berg, Dag I. K. and Anda, Bente and Arisholm, Erik and Dyb{\aa}, Tore and J{\o}rgensen, Magne and Karahasanovi{\'c}, Amela and Koren, Espen F. and Vok{\'a}c, Marek} } @article {219, title = {Does Systems Research Measure Up?}, year = {1997}, institution = {Harvard University}, abstract = {We surveyed more than two hundred systems research papers published in the last six years, and found that, in experiment after experiment, systems researchers measure the same things, but in the majority of cases the reported results are not reproducible, comparable, or statistically rigorous. In this paper we present data describing the state of systems experimentation and suggest guidelines for structuring commonly run experiments, so that results from work by different researchers can be compared more easily. 
We conclude with recommendations on how to improve the rigor of published computer systems research.}, issn = {TR-16-97}, url = {ftp://ftp.deas.harvard.edu/techreports/tr-16-97.ps.gz}, author = {Christopher Small and Narendra Ghosh and Hany Saleeb and Margo Seltzer and Keith Smith} } @conference {QualitasCorpus:APSEC:2010, title = {Qualitas Corpus: A Curated Collection of Java Code for Empirical Studies}, booktitle = {2010 Asia Pacific Software Engineering Conference (APSEC2010)}, year = {2010}, abstract = {In order to increase our ability to use measurement to support software development practise we need to do more analysis of code. However, empirical studies of code are expensive and their results are difficult to compare. We describe the Qualitas Corpus, a large curated collection of open source Java systems. The corpus reduces the cost of performing large empirical studies of code and supports comparison of measurements of the same artifacts. We discuss its design, organisation, and issues associated with its development.}, keywords = {curated code corpus, Empirical studies, experimental infrastructure}, url = {http://qualitascorpus.com/docs/citation.html}, author = {Tempero, Ewan and Anslow, Craig and Dietrich, Jens and Han, Ted and Li, Jing and Lumpe, Markus and Melton, Hayden and Noble, James} } @article {Tichy:2011:ESR:1998372.1998374, title = {Empirical software research: an interview with Dag Sj{\o}berg, University of Oslo, Norway}, journal = {Ubiquity}, volume = {2011}, year = {2011}, month = {June}, pages = {2:1{\textendash}2:14}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {Punched cards were already obsolete when I began my studies at the Technical University of Munich in 1971. Instead, we had the luxury of an interactive, line-oriented editor for typing our programs. Doug Engelbart had already invented the mouse, but the device was not yet available. With line editors, users had to identify lines by numbers and type in awkward substitution commands just to add missing semicolons. Though cumbersome by today{\textquoteright}s standards, it was obvious that line-oriented editors were far better than punched cards. Not long after, screen oriented editors such as Vi and Emacs appeared. Again, these editors were obvious improvements and everybody quickly made the switch. No detailed usability studies were needed. "Try it and you{\textquoteright}ll like it" was enough. (Brian Reid at CMU likened screen editors to handing out free cocaine in the schoolyard.) Switching from Assembler to Fortran, Algol, or Pascal also was a no-brainer. But in the late {\textquoteright}70s, the acceptance of new technologies for building software seemed to slow down, even though more people were building software tools. Debates raged over whether Pascal was superior to C, without a clear winner. Object-oriented programming, invented back in the {\textquoteright}60s with Simula, took decades to be widely adopted. Functional programming is languishing to this day. The debate about whether agile methods are better than plan-driven methods has not led to a consensus. Literally hundreds of software development technologies and programming languages have been invented, written about, and demoed over the years, only to be forgotten. 
What went wrong?}, issn = {1530-2180}, doi = {http://doi.acm.org/10.1145/1998372.1998374}, url = {http://doi.acm.org/10.1145/1998372.1998374}, author = {Tichy, Walter} } @article {620983, title = {Should Computer Scientists Experiment More?}, journal = {Computer}, volume = {31}, number = {5}, year = {1998}, pages = {32{\textendash}40}, publisher = {IEEE Computer Society Press}, address = {Los Alamitos, CA, USA}, abstract = {Do computer scientists need to experiment at all? Only if the answer is "yes" does it make sense to ask whether there is enough of it. The author argues that experimentation is central to the scientific process. Only experiments test theories. Only experiments can explore critical factors and bring new phenomena to light, so theories can be formulated and corrected. Without experiments, according to the author, computer science is in danger of drying up and becoming an auxiliary discipline. The current pressure to concentrate on application is the writing on the wall. The author rebuts the eight most common objections computer scientists have to focusing on experimentation: The traditional scientific method isn{\textquoteright}t applicable. The current level of experimentation is good enough. Experiments cost too much. Demonstrations will suffice. There{\textquoteright}s too much noise in the way. Progress will slow. Technology changes too fast. You{\textquoteright}ll never get it published. In contrast, the author argues that experimentation would build a reliable base of knowledge and thus reduce uncertainty about which theories, methods, and tools are adequate; lead to new, useful, and unexpected insights and open whole new areas of investigation; and accelerate progress by quickly eliminating fruitless approaches, erroneous assumptions, and fads. Conversely, when we ignore experimentation and avoid contact with reality, we hamper progress. As computer science leaves adolescence behind, the author advocates the development of its experimental branch. }, keywords = {Experimentation, Theory}, issn = {0018-9162}, doi = {http://dx.doi.org/10.1109/2.675631}, author = {Tichy, Walter F.} } @book {Tufte:1986:VDQ:33404, title = {The Visual Display of Quantitative Information}, year = {1986}, publisher = {Graphics Press}, organization = {Graphics Press}, address = {Cheshire, CT, USA}, isbn = {0-9613921-0-X}, author = {Tufte, Edward R.} } @conference {Vitek:2011:RRR:2038642.2038650, title = {Repeatability, reproducibility, and rigor in systems research}, booktitle = {Proceedings of the ninth ACM international conference on Embedded software}, series = {EMSOFT {\textquoteright}11}, year = {2011}, pages = {33{\textendash}38}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Computer systems research spans sub-disciplines that include embedded and real-time systems, compilers, networking, and operating systems. Our contention is that a number of structural factors inhibit quality research and decrease the velocity of science. 
We highlight some of the factors we have encountered in our work and observed in published papers and propose solutions that, if widely adopted, could both increase the productivity of researchers and the quality of their output.}, keywords = {repeatability, Reproducibility, scientific method}, isbn = {978-1-4503-0714-7}, doi = {http://doi.acm.org/10.1145/2038642.2038650}, url = {http://doi.acm.org/10.1145/2038642.2038650}, author = {Vitek, Jan and Kalibera, Tomas} } @conference {Wieringa:2009:WRS:1683309.1684889, title = {How to Write and Read a Scientific Evaluation Paper}, booktitle = {Proceedings of the 2009 17th IEEE International Requirements Engineering Conference, RE}, series = {RE {\textquoteright}09}, year = {2009}, pages = {361{\textendash}364}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {Scientific evaluation papers investigate existing problem situations or validate proposed solutions with scientific means, such as by experiment or case study. There is a growing amount of literature about how to report about empirical research in software engineering, but there is still some confusion about the difference between a scientific evaluation paper and other kinds of research papers. This is related to lack of clarity about the relation between empirical research, engineering, and industrial practice. In this minitutorial we give a brief rundown on how to structure a scientific evaluation paper as a special kind of research paper, using experiment reports and case study reports as examples. We give checklists of items that a reader should be able to find in these papers, and sketch the dilemmas that writers and readers of these papers face when applying these checklists.}, keywords = {Research methodology, Research reporting, Scientific evaluation papers}, isbn = {978-0-7695-3761-0}, doi = {http://dx.doi.org/10.1109/RE.2009.17}, url = {http://dx.doi.org/10.1109/RE.2009.17}, author = {Wieringa, Roel and Heerkens, Hans and Regnell, Bj{\"o}rn} } @article {209, title = {Empirical Software Engineering}, journal = {American Scientist}, volume = {99}, year = {2011}, abstract = {As researchers investigate how software gets made, a new empire for empirical research opens up}, doi = {http://dx.doi.org/10.1511/2011.93.466}, url = {http://www.americanscientist.org/issues/num2/2011/6/empirical-software-engineering/1}, author = {Greg Wilson and Jorge Aranda} } @book {344, title = {An Introduction to Scientific Research}, year = {1952}, publisher = {Dover Publications Inc.}, organization = {Dover Publications Inc.}, address = {New York, NY, USA}, isbn = {0-486-66545-3}, url = {http://www.amazon.com/Introduction-Scientific-Research-Bright-Wilson/dp/0486665453}, author = {Wilson, E. Bright} } @conference {Zannier:2006:SES:1134285.1134333, title = {On the success of empirical studies in the international conference on software engineering}, booktitle = {Proceedings of the 28th international conference on Software engineering}, series = {ICSE {\textquoteright}06}, year = {2006}, pages = {341{\textendash}350}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Critiques of the quantity and quality of empirical evaluations in software engineering have existed for quite some time. However such critiques are typically not empirically evaluated. This paper fills this gap by empirically analyzing papers published by ICSE, the prime research conference on Software Engineering. 
We present quantitative and qualitative results of a quasi-random experiment of empirical evaluations over the lifetime of the conference. Our quantitative results show the quantity of empirical evaluation has increased over 29 ICSE proceedings but we still have room to improve the soundness of empirical evaluations in ICSE proceedings. Our qualitative results point to specific areas of improvement in empirical evaluations.}, keywords = {empirical evaluation}, isbn = {1-59593-375-1}, doi = {http://doi.acm.org/10.1145/1134285.1134333}, url = {http://doi.acm.org/10.1145/1134285.1134333}, author = {Zannier, Carmen and Melnik, Grigori and Maurer, Frank} }