Federico Silla

Contact

Position:: Full Professor

Address:: Valencia
Email:: This email address is being protected from spambots. You need JavaScript enabled to view it.
Phone:: +34963877904
Website:: https://sites.google.com/site/federicosillaupv/home

Image & Curriculum Vitae

Image & Curriculum Vitae

Publications

Carlos Reaño and Federico Silla. A Comparative Performance Analysis of Remote GPU Virtualization over Three Generations of GPUs. In 46th International Conference on Parallel Processing Workshops, ICPP Workshops 2017, Bristol, United Kingdom, August 14-17, 2017. 2017, 121–128. URL, DOI BibTeX

@inproceedings{DBLP:conf/icppw/ReanoS17,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {A Comparative Performance Analysis of Remote {GPU} Virtualization over Three Generations of {GPUs}},
  booktitle = {46th International Conference on Parallel Processing Workshops, ICPP Workshops 2017, Bristol, United Kingdom, August 14-17, 2017},
  pages     = {121--128},
  year      = {2017},
  doi       = {10.1109/ICPPW.2017.29},
  url       = {https://doi.org/10.1109/ICPPW.2017.29},
  crossref  = {DBLP:conf/icppw/2017}
}

Carlos Reaño, Federico Silla and Jose Duato. Enhancing the rCUDA Remote GPU Virtualization Framework: from a Prototype to a Production Solution. In Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGRID 2017, Madrid, Spain, May 14-17, 2017. 2017, 695–698. URL, DOI BibTeX

@inproceedings{DBLP:conf/ccgrid/ReanoSD17,
  author    = {Rea{\~n}o, Carlos and Silla, Federico and Duato, Jose},
  title     = {Enhancing the {rCUDA} Remote {GPU} Virtualization Framework: from a Prototype to a Production Solution},
  booktitle = {Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGRID 2017, Madrid, Spain, May 14-17, 2017},
  pages     = {695--698},
  year      = {2017},
  doi       = {10.1109/CCGRID.2017.42},
  url       = {https://doi.org/10.1109/CCGRID.2017.42},
  crossref  = {DBLP:conf/ccgrid/2017}
}

Javier Prades, Blesson Varghese, Carlos Reaño and Federico Silla. Multi-tenant virtual GPUs for optimising performance of a financial risk application. J. Parallel Distrib. Comput. 108:28–44, 2017. URL, DOI BibTeX

@article{DBLP:journals/jpdc/PradesVRS17,
  author  = {Prades, Javier and Varghese, Blesson and Rea{\~n}o, Carlos and Silla, Federico},
  title   = {Multi-tenant virtual {GPUs} for optimising performance of a financial risk application},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {108},
  pages   = {28--44},
  year    = {2017},
  doi     = {10.1016/j.jpdc.2016.06.002},
  url     = {https://doi.org/10.1016/j.jpdc.2016.06.002}
}

Federico Silla, Sergio Iserte, Carlos Reaño and Javier Prades. On the benefits of the remote GPU virtualization mechanism: The rCUDA case. Concurrency and Computation: Practice and Experience 29(13), 2017. URL, DOI BibTeX

@article{DBLP:journals/concurrency/SillaIRP17,
  author  = {Silla, Federico and Iserte, Sergio and Rea{\~n}o, Carlos and Prades, Javier},
  title   = {On the benefits of the remote {GPU} virtualization mechanism: {The} {rCUDA} case},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {29},
  number  = {13},
  year    = {2017},
  doi     = {10.1002/cpe.4072},
  url     = {https://doi.org/10.1002/cpe.4072}
}

Javier Prades, Fernando Campos, Carlos Reaño and Federico Silla. GPGPU as a Service: Providing GPU-Acceleration Services to Federated Cloud Systems. In Developing Interoperable and Federated Cloud Architecture. 2016, pages 281–313. URL, DOI BibTeX

@incollection{GPGPUaaS_book_chapter10,
  author    = {Prades, Javier and Campos, Fernando and Rea{\~n}o, Carlos and Silla, Federico},
  title     = {{GPGPU} as a Service: Providing {GPU}-Acceleration Services to Federated Cloud Systems},
  booktitle = {Developing Interoperable and Federated Cloud Architecture},
  publisher = {IGI Global},
  pages     = {281--313},
  year      = {2016},
  doi       = {10.4018/978-1-5225-0153-4.ch010},
  url       = {https://doi.org/10.4018/978-1-5225-0153-4.ch010}
}

Sergio Iserte, Javier Prades, Carlos Reaño and Federico Silla. Increasing the Performance of Data Centers by Combining Remote GPU Virtualization with Slurm. In IEEE/ACM 16th International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016, Cartagena, Colombia, May 16-19, 2016. 2016, 98–101. URL, DOI BibTeX

@inproceedings{DBLP:conf/ccgrid/IsertePRS16,
  author    = {Iserte, Sergio and Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Increasing the Performance of Data Centers by Combining Remote {GPU} Virtualization with {Slurm}},
  booktitle = {IEEE/ACM 16th International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016, Cartagena, Colombia, May 16-19, 2016},
  pages     = {98--101},
  year      = {2016},
  doi       = {10.1109/CCGrid.2016.26},
  url       = {https://doi.org/10.1109/CCGrid.2016.26},
  crossref  = {DBLP:conf/ccgrid/2016}
}

Ferran Perez, Carlos Reaño and Federico Silla. Providing CUDA Acceleration to KVM Virtual Machines in InfiniBand Clusters with rCUDA. In Distributed Applications and Interoperable Systems - 16th IFIP WG 6.1 International Conference, DAIS 2016, Held as Part of the 11th International Federated Conference on Distributed Computing Techniques, DisCoTec 2016, Heraklion, Crete, Greece, June 6-9, 2016, Proceedings. 2016, 82–95. URL, DOI BibTeX

@inproceedings{DBLP:conf/dais/PerezRS16,
  author    = {Perez, Ferran and Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Providing {CUDA} Acceleration to {KVM} Virtual Machines in {InfiniBand} Clusters with {rCUDA}},
  booktitle = {Distributed Applications and Interoperable Systems - 16th IFIP WG 6.1 International Conference, DAIS 2016, Held as Part of the 11th International Federated Conference on Distributed Computing Techniques, DisCoTec 2016, Heraklion, Crete, Greece, June 6-9, 2016, Proceedings},
  pages     = {82--95},
  year      = {2016},
  doi       = {10.1007/978-3-319-39577-7_7},
  url       = {https://doi.org/10.1007/978-3-319-39577-7_7},
  crossref  = {DBLP:conf/dais/2016}
}

Federico Silla, Javier Prades, Sergio Iserte and Carlos Reaño. Remote GPU Virtualization: Is It Useful?. In 2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era HiPINEB@HPCA 2016, Barcelona, Spain, March 12, 2016. 2016, 41–48. URL, DOI BibTeX

@inproceedings{DBLP:conf/hpca/SillaPIR16,
  author    = {Silla, Federico and Prades, Javier and Iserte, Sergio and Rea{\~n}o, Carlos},
  title     = {Remote {GPU} Virtualization: Is It Useful?},
  booktitle = {2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era HiPINEB@HPCA 2016, Barcelona, Spain, March 12, 2016},
  pages     = {41--48},
  year      = {2016},
  doi       = {10.1109/HIPINEB.2016.8},
  url       = {https://doi.org/10.1109/HIPINEB.2016.8},
  crossref  = {DBLP:conf/hpca/2016hipineb}
}

Carlos Reaño and Federico Silla. Performance Evaluation of the NVIDIA Pascal GPU Architecture: Early Experiences. In 18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016. 2016, 1234–1235. URL, DOI BibTeX

@inproceedings{DBLP:conf/hpcc/ReanoS16a,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Performance Evaluation of the {NVIDIA} {Pascal} {GPU} Architecture: Early Experiences},
  booktitle = {18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016},
  pages     = {1234--1235},
  year      = {2016},
  doi       = {10.1109/HPCC-SmartCity-DSS.2016.0173},
  url       = {https://doi.org/10.1109/HPCC-SmartCity-DSS.2016.0173},
  crossref  = {DBLP:conf/hpcc/2016}
}

Carlos Reaño and Federico Silla. Extending rCUDA with Support for P2P Memory Copies between Remote GPUs. In 18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016. 2016, 789–796. URL, DOI BibTeX

@inproceedings{DBLP:conf/hpcc/ReanoS16,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Extending {rCUDA} with Support for {P2P} Memory Copies between Remote {GPUs}},
  booktitle = {18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016},
  pages     = {789--796},
  year      = {2016},
  doi       = {10.1109/HPCC-SmartCity-DSS.2016.0114},
  url       = {https://doi.org/10.1109/HPCC-SmartCity-DSS.2016.0114},
  crossref  = {DBLP:conf/hpcc/2016}
}

Javier Prades, Carlos Reaño and Federico Silla. CUDA acceleration for Xen virtual machines in infiniband clusters with rCUDA. In Proceedings of the 21st ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2016, Barcelona, Spain, March 12-16, 2016. 2016, 35:1–35:2. URL, DOI BibTeX

@inproceedings{DBLP:conf/ppopp/PradesRS16,
  author    = {Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico},
  title     = {{CUDA} acceleration for {Xen} virtual machines in infiniband clusters with {rCUDA}},
  booktitle = {Proceedings of the 21st ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2016, Barcelona, Spain, March 12-16, 2016},
  pages     = {35:1--35:2},
  year      = {2016},
  doi       = {10.1145/2851141.2851181},
  url       = {https://doi.org/10.1145/2851141.2851181},
  crossref  = {DBLP:conf/ppopp/2016}
}

Carlos Reaño, Federico Silla and Matthew J Leslie. SchedGPU: Fine-grain dynamic and adaptative scheduling for GPUs. In International Conference on High Performance Computing & Simulation, HPCS 2016, Innsbruck, Austria, July 18-22, 2016. 2016, 993–997. URL, DOI BibTeX

@inproceedings{DBLP:conf/ieeehpcs/ReanoSL16,
  author    = {Rea{\~n}o, Carlos and Silla, Federico and Leslie, Matthew J.},
  title     = {{SchedGPU}: Fine-grain dynamic and adaptative scheduling for {GPUs}},
  booktitle = {International Conference on High Performance Computing \& Simulation, HPCS 2016, Innsbruck, Austria, July 18-22, 2016},
  pages     = {993--997},
  year      = {2016},
  doi       = {10.1109/HPCSim.2016.7568444},
  url       = {https://doi.org/10.1109/HPCSim.2016.7568444},
  crossref  = {DBLP:conf/ieeehpcs/2016}
}

Carlos Reaño and Federico Silla. Reducing the performance gap of remote GPU virtualization with InfiniBand Connect-IB. In IEEE Symposium on Computers and Communication, ISCC 2016, Messina, Italy, June 27-30, 2016. 2016, 920–925. URL, DOI BibTeX

@inproceedings{DBLP:conf/iscc/ReanoS16,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Reducing the performance gap of remote {GPU} virtualization with {InfiniBand} {Connect-IB}},
  booktitle = {IEEE Symposium on Computers and Communication, ISCC 2016, Messina, Italy, June 27-30, 2016},
  pages     = {920--925},
  year      = {2016},
  doi       = {10.1109/ISCC.2016.7543854},
  url       = {https://doi.org/10.1109/ISCC.2016.7543854},
  crossref  = {DBLP:conf/iscc/2016}
}

Carlos Reaño, Federico Silla, Adrián Castelló, Antonio J Peña, Rafael Mayo, Enrique S Quintana-Ortí and Jose Duato. Improving the user experience of the rCUDA remote GPU virtualization framework. Concurrency and Computation: Practice and Experience 27(14):3746–3770, 2015. URL, DOI BibTeX

@article{DBLP:journals/concurrency/ReanoSGPMQD15,
  author  = {Rea{\~n}o, Carlos and Silla, Federico and Castell{\'o}, Adri{\'a}n and Pe{\~n}a, Antonio J. and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title   = {Improving the user experience of the {rCUDA} remote {GPU} virtualization framework},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {27},
  number  = {14},
  pages   = {3746--3770},
  year    = {2015},
  doi     = {10.1002/cpe.3409},
  url     = {https://doi.org/10.1002/cpe.3409}
}

Carlos Reaño and Federico Silla. A Live Demo on Remote GPU Accelerated Deep Learning Using the rCUDA Middleware. In Proceedings of the Posters and Demos Session of the 16th International Middleware Conference, Middleware Posters and Demos 2015, Vancouver, BC, Canada, December 7-11, 2015. 2015, 3:1–3:2. URL, DOI BibTeX

@inproceedings{DBLP:conf/middleware/ReanoS15,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {A Live Demo on Remote {GPU} Accelerated Deep Learning Using the {rCUDA} Middleware},
  booktitle = {Proceedings of the Posters and Demos Session of the 16th International Middleware Conference, Middleware Posters and Demos 2015, Vancouver, BC, Canada, December 7-11, 2015},
  pages     = {3:1--3:2},
  year      = {2015},
  doi       = {10.1145/2830894.2830897},
  url       = {https://doi.org/10.1145/2830894.2830897},
  crossref  = {DBLP:conf/middleware/2015pd}
}

Blesson Varghese, Javier Prades, Carlos Reaño and Federico Silla. Acceleration-as-a-Service: Exploiting Virtualised GPUs for a Financial Application. In 11th IEEE International Conference on e-Science, e-Science 2015, Munich, Germany, August 31 - September 4, 2015. 2015, 47–56. URL, DOI BibTeX

@inproceedings{DBLP:conf/eScience/VarghesePRS15,
  author    = {Varghese, Blesson and Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico},
  title     = {Acceleration-as-a-Service: Exploiting Virtualised {GPUs} for a Financial Application},
  booktitle = {11th IEEE International Conference on e-Science, e-Science 2015, Munich, Germany, August 31 - September 4, 2015},
  pages     = {47--56},
  year      = {2015},
  doi       = {10.1109/eScience.2015.15},
  url       = {https://doi.org/10.1109/eScience.2015.15},
  crossref  = {DBLP:conf/eScience/2015}
}

Carlos Reaño and Federico Silla. InfiniBand Verbs Optimizations for Remote GPU Virtualization. In 2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015. 2015, 825–832. URL, DOI BibTeX

@inproceedings{DBLP:conf/cluster/ReanoS15a,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {{InfiniBand} Verbs Optimizations for Remote {GPU} Virtualization},
  booktitle = {2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015},
  pages     = {825--832},
  year      = {2015},
  doi       = {10.1109/CLUSTER.2015.139},
  url       = {https://doi.org/10.1109/CLUSTER.2015.139},
  crossref  = {DBLP:conf/cluster/2015}
}

Carlos Reaño and Federico Silla. A Performance Comparison of CUDA Remote GPU Virtualization Frameworks. In 2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015. 2015, 488–489. URL, DOI BibTeX

@inproceedings{DBLP:conf/cluster/ReanoS15,
  author    = {Rea{\~n}o, Carlos and Silla, Federico},
  title     = {A Performance Comparison of {CUDA} Remote {GPU} Virtualization Frameworks},
  booktitle = {2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015},
  pages     = {488--489},
  year      = {2015},
  doi       = {10.1109/CLUSTER.2015.76},
  url       = {https://doi.org/10.1109/CLUSTER.2015.76},
  crossref  = {DBLP:conf/cluster/2015}
}

Carlos Reaño, Ferran Perez and Federico Silla. On the Design of a Demo for Exhibiting rCUDA. In 15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2015, Shenzhen, China, May 4-7, 2015. 2015, 1169–1172. URL, DOI BibTeX

@inproceedings{DBLP:conf/ccgrid/ReanoPS15,
  author    = {Rea{\~n}o, Carlos and Perez, Ferran and Silla, Federico},
  title     = {On the Design of a Demo for Exhibiting {rCUDA}},
  booktitle = {15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2015, Shenzhen, China, May 4-7, 2015},
  pages     = {1169--1172},
  year      = {2015},
  doi       = {10.1109/CCGrid.2015.53},
  url       = {https://doi.org/10.1109/CCGrid.2015.53},
  crossref  = {DBLP:conf/ccgrid/2015}
}

Carlos Reaño, Federico Silla, Gilad Shainer and Scot Schultz. Local and Remote GPUs Perform Similar with EDR 100G InfiniBand. In Proceedings of the Industrial Track of the 16th International Middleware Conference, Middleware Industry 2015, Vancouver, BC, Canada, December 7-11, 2015. 2015, 4:1–4:7. URL, DOI BibTeX

@inproceedings{DBLP:conf/middleware/ReanoSSS15,
  author    = {Rea{\~n}o, Carlos and Silla, Federico and Shainer, Gilad and Schultz, Scot},
  title     = {Local and Remote {GPUs} Perform Similar with {EDR} {100G} {InfiniBand}},
  booktitle = {Proceedings of the Industrial Track of the 16th International Middleware Conference, Middleware Industry 2015, Vancouver, BC, Canada, December 7-11, 2015},
  pages     = {4:1--4:7},
  year      = {2015},
  doi       = {10.1145/2830013.2830015},
  url       = {https://doi.org/10.1145/2830013.2830015},
  crossref  = {DBLP:conf/middleware/2015i}
}

Antonio José Peña, Carlos Reaño, Federico Silla, Rafael Mayo, Enrique S Quintana-Ortí and Jose Duato. A complete and efficient CUDA-sharing solution for HPC clusters. Parallel Computing 40(10):574–588, 2014. URL, DOI BibTeX

@article{DBLP:journals/pc/PenaRSMQD14,
  author  = {Pe{\~n}a, Antonio Jos{\'e} and Rea{\~n}o, Carlos and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title   = {A complete and efficient {CUDA}-sharing solution for {HPC} clusters},
  journal = {Parallel Computing},
  volume  = {40},
  number  = {10},
  pages   = {574--588},
  year    = {2014},
  doi     = {10.1016/j.parco.2014.09.011},
  url     = {http://dx.doi.org/10.1016/j.parco.2014.09.011}
}

Carlos Reaño, Federico Silla, Antonio José Peña, Gilad Shainer, Scot Schultz, Adrián Castelló Gimeno, Enrique S Quintana-Ortí and Jose Duato. Boosting the performance of remote GPU virtualization using InfiniBand connect-IB and PCIe 3.0. In 2014 IEEE International Conference on Cluster Computing, CLUSTER 2014, Madrid, Spain, September 22-26, 2014. 2014, 266–267. URL, DOI BibTeX

@inproceedings{DBLP:conf/cluster/ReanoSPSSGQD14,
  author    = {Rea{\~n}o, Carlos and Silla, Federico and Pe{\~n}a, Antonio Jos{\'e} and Shainer, Gilad and Schultz, Scot and Castell{\'o} Gimeno, Adri{\'a}n and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title     = {Boosting the performance of remote {GPU} virtualization using {InfiniBand} connect-{IB} and {PCIe} 3.0},
  booktitle = {2014 IEEE International Conference on Cluster Computing, CLUSTER 2014, Madrid, Spain, September 22-26, 2014},
  pages     = {266--267},
  year      = {2014},
  doi       = {10.1109/CLUSTER.2014.6968737},
  url       = {https://doi.org/10.1109/CLUSTER.2014.6968737},
  crossref  = {DBLP:conf/cluster/2014}
}

Sergio Iserte, Adrián Castelló Gimeno, Rafael Mayo, Enrique S Quintana-Ortí, Federico Silla, Jose Duato, Carlos Reaño and Javier Prades. SLURM Support for Remote GPU Virtualization: Implementation and Performance Study. In 26th IEEE International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD 2014, Paris, France, October 22-24, 2014. 2014, 318–325. URL, DOI BibTeX

@inproceedings{DBLP:conf/sbac-pad/IserteGMQSDRP14,
  author    = {Iserte, Sergio and Castell{\'o} Gimeno, Adri{\'a}n and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Silla, Federico and Duato, Jose and Rea{\~n}o, Carlos and Prades, Javier},
  title     = {{SLURM} Support for Remote {GPU} Virtualization: Implementation and Performance Study},
  booktitle = {26th IEEE International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD 2014, Paris, France, October 22-24, 2014},
  pages     = {318--325},
  year      = {2014},
  doi       = {10.1109/SBAC-PAD.2014.49},
  url       = {https://doi.org/10.1109/SBAC-PAD.2014.49},
  crossref  = {DBLP:conf/sbac-pad/2014}
}

Carlos Reaño, Antonio José Peña, Federico Silla, Rafa Mayo, Enrique S Quintana-Ortí and Jose Duato. Influence of InfiniBand FDR on the Performance of Remote GPU Virtualization. In International Conference on Cluster Computing (Cluster). 2013. BibTeX

@inproceedings{reanoInfluence,
  author    = {Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title     = {Influence of {InfiniBand} {FDR} on the Performance of Remote {GPU} Virtualization},
  booktitle = {International Conference on Cluster Computing (Cluster)},
  year      = {2013}
}

Carlos Reaño, Antonio José Peña, Federico Silla, R Mayo, E S Quintana-Ortí and Jose Duato. CU2rCU: towards the Complete rCUDA Remote GPU Virtualization and Sharing Solution. In 19th Annual International Conference on High Performance Computing (HiPC). December 2012. URL BibTeX

@inproceedings{CU2rCU_HiPC12,
  author    = {Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title     = {{CU2rCU}: towards the Complete {rCUDA} Remote {GPU} Virtualization and Sharing Solution},
  booktitle = {19th Annual International Conference on High Performance Computing (HiPC)},
  month     = dec,
  year      = {2012},
  url       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6507485&isnumber=6507469}
}

Carlos Reaño, Federico Silla and Germán Vidal. CU2rCU: A CUDA-to-rCUDA Converter. Universitat Politècnica de València, Spain, 2012. URL BibTeX

@mastersthesis{CU2rCU_Master,
  author  = {Rea{\~n}o, Carlos and Silla, Federico and Vidal, Germ{\'a}n},
  title   = {{CU2rCU}: A {CUDA}-to-{rCUDA} Converter},
  school  = {Universitat Polit{\`e}cnica de Val{\`e}ncia},
  address = {Spain},
  year    = {2012},
  url     = {http://hdl.handle.net/10251/27435}
}

@inproceedings{CU2rCU_HiPC2012,
  author    = {Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
  title     = {{CU2rCU}: towards the Complete {rCUDA} Remote {GPU} Virtualization and Sharing Solution},
  booktitle = {19th Annual International Conference on High Performance Computing (HiPC)},
  month     = dec,
  year      = {2012},
  url       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6507485&isnumber=6507469}
}

Carles Hernández, Antoni Roca, Federico Silla, Jose Flich and Jose Duato. On the Impact of Within-Die Process Variation in GALS-Based NoC Performance. IEEE Trans. on CAD of Integrated Circuits and Systems 31(2):294–307, 2012. BibTeX

@article{DBLP:journals/tcad/HernandezRSFD12,
  author  = {Hern{\'a}ndez, Carles and Roca, Antoni and Silla, Federico and Flich, Jose and Duato, Jose},
  title   = {On the Impact of Within-Die Process Variation in {GALS}-Based {NoC} Performance},
  journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {31},
  number  = {2},
  pages   = {294--307},
  year    = {2012}
}

Alesandro Strano, Carles Hernández, Federico Silla and Davide Bertozzi. Self-Calibrating Source Synchronous Communication for Delay Variation Tolerant GALS Network-on-Chip Design. International Journal of Embedded and Real-Time Communication Systems (IJERTCS) 2(4):20, October 2011. DOI BibTeX

@article{1947-317,
  author  = {Strano, Alesandro and Hern{\'a}ndez, Carles and Silla, Federico and Bertozzi, Davide},
  title   = {Self-Calibrating Source Synchronous Communication for Delay Variation Tolerant {GALS} Network-on-Chip Design},
  journal = {International Journal of Embedded and Real-Time Communication Systems (IJERTCS)},
  volume  = {2},
  number  = {4},
  pages   = {20},
  month   = oct,
  year    = {2011},
  doi     = {10.4018/jertcs.2011100101},
  issn    = {1947-3176}
}

Carles Hernández, Federico Silla and Jose Duato. Energy and Performance Efficient Thread Mapping in NoC-Based CMPs under Process Variations. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 41–50. DOI BibTeX

@inproceedings{6047171,
  author    = {Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose},
  title     = {Energy and Performance Efficient Thread Mapping in {NoC}-Based {CMPs} under Process Variations},
  booktitle = {2011 International Conference on Parallel Processing (ICPP)},
  pages     = {41--50},
  month     = sep,
  year      = {2011},
  doi       = {10.1109/ICPP.2011.48},
  issn      = {0190-3918},
  abstract  = {Within-die process variation causes cores, memories, and network resources in NoC-based CMPs to present different speeds and leakage power. In this context, thread mapping strategies that consider the effects of process variability on chip resources arise as a suitable choice to maximize performance while energy consumption constraints are satisfied. However, other factors, as the location of memory controllers and the concurrent execution of several applications in the chip, can bound the possible benefits of such mapping strategies. In this paper we propose a mapping strategy, named as uniform regions, that takes variability effects into account when assigning application threads to cores in the chip. More specifically, uniform regions, in terms of operating frequency, that additionally present the highest available frequency, are selected so that the benefits of such a variation-aware mapping strategy in a NoC-based CMP are maximized. We additionally present two different ways of configuring the frequency and voltage of the cores in the selected region. The first one is intended to provide the maximum performance while keeping energy as low as possible, while the second one is much more for energy-aware. The first one reduces the execution time up to a 23\% while reducing the energy up to 24\% whereas the second one provides smaller speed ups while reduces energy up to 33\%.}
}

Antoni Roca, Carles Hernández, Jose Flich, Federico Silla and Jose Duato. A Distributed Switch Architecture for On-Chip Networks. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 21–30. DOI BibTeX

@inproceedings{6047169,
  author    = {Roca, Antoni and Hern{\'a}ndez, Carles and Flich, Jose and Silla, Federico and Duato, Jose},
  title     = {A Distributed Switch Architecture for On-Chip Networks},
  booktitle = {2011 International Conference on Parallel Processing (ICPP)},
  pages     = {21--30},
  month     = sep,
  year      = {2011},
  doi       = {10.1109/ICPP.2011.28},
  issn      = {0190-3918},
  abstract  = {It is well-known that current Chip Multiprocessor (CMP) and high-end MultiProcessor System-on-Chip (MPSoC) designs are growing in their number of components. Networks-on-Chip (NoC) provide the required connectivity for such CMP and MPSoC designs at reasonable costs. However, as technology advances, links become the critical component in the NoC. First, because the power consumption of the link is extremely high with respect the power consumption of the rest of components (mainly switches), becoming unacceptable for long global interconnects. Second, the delay of a link does not scale with technology, thus, degrading the performance of the network. To solve both problems, several solutions have been previously proposed. In this paper, we present a new switch architecture that reduces the negative impact of links on the NoC. We call our proposal distributed switch. The distributed switch moves the circuitry of a standard switch onto the links. Then, packets are buffered, routed, and forwarded at the same time they are crossing the link. Distributing a standard switch onto the link improves the trade off between the power consumption and the operating frequency of the entire network. In contrast, area requirements are increased. The distributed switch reduces up to 14.8\% the peak power consumption while increases its area up to 22\%. Furthermore, the distributed switch is able to increase the maximum achievable frequency with respect to the standard switch. In particular, the maximum operating frequency of the distributed switch can be increased up to 14.3\%.}
}

Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Ortí. Performance of CUDA Virtualized Remote GPUs in High Performance Clusters. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 365–374. DOI BibTeX

@inproceedings{6047204,
  author    = {Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S.},
  title     = {Performance of {CUDA} Virtualized Remote {GPUs} in High Performance Clusters},
  booktitle = {2011 International Conference on Parallel Processing (ICPP)},
  pages     = {365--374},
  month     = sep,
  year      = {2011},
  doi       = {10.1109/ICPP.2011.58},
  issn      = {0190-3918},
  abstract  = {In a previous work we presented the architecture of rCUDA, a middleware that enables CUDA remoting over a commodity network. That is, the middleware allows an application to use a CUDA-compatible Graphics Processor (GPU) installed in a remote computer as if it were installed in the computer where the application is being executed. This approach is based on the observation that GPUs in a cluster are not usually fully utilized, and it is intended to reduce the number of GPUs in the cluster, thus lowering the costs related with acquisition and maintenance while keeping performance close to that of the fully-equipped configuration. In this paper we model rCUDA over a series of high throughput networks in order to assess the influence of the performance of the underlying network on the performance of our virtualization technique. For this purpose, we analyze the traces of two different case studies over two different networks. Using this data, we calculate the expected performance for these same case studies over a series of high throughput networks, in order to characterize the expected behavior of our solution in high performance clusters. The estimations are validated using real 1 Gbps Ethernet and 40 Gbps InfiniBand networks, showing an error rate in the order of 1\% for executions involving data transfers above 40 MB. In summary, although our virtualization technique noticeably increases execution time when using a 1 Gbps Ethernet network, it performs almost as efficiently as a local GPU when higher performance interconnects are used. Therefore, the small overhead incurred by our proposal because of the remote use of GPUs is worth the savings that a cluster configuration with less GPUs than nodes reports.}
}

Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, Federico Silla and Jose Duato. Cost-Efficient On-Chip Routing Implementations for CMP and MPSoC Systems. Computer-Aided Design of Integrated Circuits and Systems, IEEE Transactions on 30(4):534–547, April 2011. URL, DOI BibTeX

@article{5737867,
  author        = {Rodrigo, Samuel and Flich, Jose and Roca, Antoni and Medardoni, S. and Bertozzi, Davide and Silla, Federico and Duato, Jose},
  title         = {Cost-Efficient On-Chip Routing Implementations for {CMP} and {MPSoC} Systems},
  journal       = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume        = {30},
  number        = {4},
  pages         = {534--547},
  month         = apr,
  year          = {2011},
  doi           = {10.1109/TCAD.2011.2119150},
  url           = {https://doi.org/10.1109/TCAD.2011.2119150},
  issn          = {0278-0070},
  keywords      = {Fault-tolerance, logic design, networks-on-chip, routing},
  abstract      = {The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30\% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept.},
  internal-note = {The original export contained an empty author name between Bertozzi and Silla; an author may be missing. Verify the author list against the DOI record.}
}

Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Ortí. Enabling CUDA acceleration within virtual machines using rCUDA. Proceedings of HiPC 2011, 2011. URL BibTeX

@conference{N/A,
	author = "Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'\i}, Enrique S.",
	abstract = "The hardware and software advances of Graphics Processing Units (GPUs) have favored the development of GPGPU (General-Purpose Computation on GPUs) and its adoption in many scientific, engineering, and industrial areas. Thus, GPUs are increasingly being introduced in high-performance computing systems as well as in datacenters. On the other hand, virtualization technologies are also receiving rising interest in these domains, because of their many benefits on acquisition and maintenance savings. There are currently several works on GPU virtualization. However, there is no standard solution allowing access to GPGPU capabilities from virtual machine environments like, e.g., VMware, Xen, VirtualBox, or KVM. Such lack of a standard solution is delaying the integration of GPGPU into these domains.",
	booktitle = "Proceedings of HiPC 2011",
	keywords = "Virtual machine;rCUDA",
	note = "Clusters;CUDA;High performance computing;Virtualizations;",
	title = "{E}nabling {CUDA} acceleration within virtual machines using r{CUDA}",
	url = "http://www.hipc.org/hipc2011/program.php",
	year = 2011
}

Carles Hernández, Antoni Roca, Jose Flich, Federico Silla and Jose Duato. Fault-Tolerant Vertical Link Design for Effective 3D Stacking. IEEE Computer Architecture Letters 99(RapidPosts), 2011. URL, DOI BibTeX

@article{10.1109/L-CA.2011.17,
	author    = {Hern{\'a}ndez, Carles and Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose},
	title     = {{F}ault-{T}olerant {V}ertical {L}ink {D}esign for {E}ffective 3{D} {S}tacking},
	journal   = {IEEE Computer Architecture Letters},
	volume    = 99,
	number    = {RapidPosts},
	year      = 2011,
	publisher = {IEEE Computer Society},
	address   = {Los Alamitos, CA, USA},
	issn      = {1556-6056},
	doi       = {10.1109/L-CA.2011.17},
	url       = {http://doi.ieeecomputersociety.org/10.1109/L-CA.2011.17}
}

Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Cost-efficient on-chip routing implementations for CMP and MPSoC systems. 2011, 534 - 547. URL, DOI BibTeX

@article{20111313880819,
	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and Medardoni, S. and Bertozzi, D. and Camacho, Jesus and Silla, Federico and Duato, Jose",
	abstract = "The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept. {\copyright} 2006 IEEE.",
	address = "445 Hoes Lane / P.O. Box 1331, Piscataway, NJ 08855-1331, United States",
	doi = "10.1109/TCAD.2011.2119150",
	issn = "0278-0070",
	journal = "IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems",
	key = "Fault tolerance",
	keywords = "Computer software selection and evaluation;Logic design;Microprocessor chips;Quality assurance;Telecommunication networks;Topology;",
	note = "Cost-efficient;Distributed routing;Efficient routing;High-performance computing;Irregular topology;Key component;Latency constraints;Many-core;Networks on chips;networks-on-chip;On chips;Performance costs;Power Consumption;Power-aware;Router model;routing;Routing table;Universal logic;",
	number = 4,
	pages = "534--547",
	title = "{C}ost-efficient on-chip routing implementations for {CMP} and {MPS}o{C} systems",
	url = "http://dx.doi.org/10.1109/TCAD.2011.2119150",
	volume = 30,
	year = 2011
}

Carles Hernández, Antoni Roca, Jose Flich, Federico Silla and Jose Duato. Characterizing the impact of process variation on 45 nm NoC-based CMPs. Journal of Parallel and Distributed Computing 71(5):651 - 663, 2011. URL, DOI BibTeX

@article{20111413888254,
	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
	abstract = "Current integration scales make possible to design chip multiprocessors with a large amount of cores interconnected by a NoC. Unfortunately, they also bring process variation, posing a new burden to processor manufacturers. Regarding the NoC, variability causes that the delays of links and routers do not match those initially established at design time. In this paper we analyze how variability affects the NoC by applying a new variability model to 100 instances of an 8 $\times$ 8 mesh NoC synthesized using 45 nm technology. We also show that GALS-based NoCs present communication bottlenecks due to the slower components of the network, which cause congestion, thus reducing performance. This performance reduction finally affects the applications being executed in the CMP because they may be mapped to slower areas of the chip. In this paper we show that using a mapping algorithm that considers variability data may improve application execution time up to 50%. {\copyright} 2010 Elsevier Inc. All rights reserved.",
	address = "6277 Sea Harbor Drive, Orlando, FL 32887-4900, United States",
	doi = "10.1016/j.jpdc.2010.09.006",
	issn = "0743-7315",
	journal = "Journal of Parallel and Distributed Computing",
	key = "Routers",
	keywords = "Conformal mapping;Design;Microprocessor chips;Multiprocessing systems;Servers;Systems analysis;VLSI circuits;",
	note = "Chip Multiprocessor;NoC (or Network-on-Chip);Process mapping;Process variations;Router design;",
	number = 5,
	pages = "651--663",
	title = "{C}haracterizing the impact of process variation on 45 nm {N}o{C}-based {CMP}s",
	url = "http://dx.doi.org/10.1016/j.jpdc.2010.09.006",
	volume = 71,
	year = 2011
}

Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Cost-Efficient On-Chip Routing Implementations for CMP and MPSoC Systems. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 30(4):534 - 47, 2011. URL, DOI BibTeX

@article{11874902,
	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and Medardoni, S. and Bertozzi, D. and Camacho, Jesus and Silla, Federico and Duato, Jose",
	abstract = "The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept.",
	address = "USA",
	doi = "10.1109/TCAD.2011.2119150",
	issn = "0278-0070",
	journal = "IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems",
	keywords = "microprocessor chips;network routing;network-on-chip;",
	note = "cost-efficient on-chip routing implementations;chip multiprocessors;CMP;MPSoC Systems;many-core system-on-chip;networks-on-chip;communication scalability;latency constraints;area constraints;power constraints;application-level parallelism;power-aware techniques;topology regularity;universal logic-based distributed routing;logic-based mechanism;2D meshes;fault tolerance;fault performance;power consumption;",
	number = 4,
	pages = "534--547",
	title = "{C}ost-{E}fficient {O}n-{C}hip {R}outing {I}mplementations for {CMP} and {MPS}o{C} {S}ystems",
	url = "http://dx.doi.org/10.1109/TCAD.2011.2119150",
	volume = 30,
	year = 2011
}

Carles Hernández, Federico Silla and Jose Duato. Energy and Performance Efficient Thread Mapping in NoC-Based CMPs under Process Variations. In ICPP. 2011, 41-50. BibTeX

@conference{DBLP:conf/icpp/HernandezSD11,
	author = "Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose",
	booktitle = "ICPP",
	crossref = "DBLP:conf/icpp/2011",
	pages = "41--50",
	title = "{E}nergy and {P}erformance {E}fficient {T}hread {M}apping in {N}o{C}-{B}ased {CMP}s under {P}rocess {V}ariations",
	year = 2011
}

Héctor Montaner, Federico Silla, Holger Froning and Jose Duato. A new degree of freedom for memory allocation in clusters. Cluster Computing, pages 1 - 23, 2011. URL BibTeX

@article{IP51265029,
	author = {Montaner, H{\'e}ctor and Silla, Federico and Fr{\"o}ning, Holger and Duato, Jose},
	abstract = "Improvements in parallel computing hardware usually involve increments in the number of available resources for a given application such as the number of computing cores and the amount of memory. In the case of shared-memory computers, the increase in computing resources and available memory is usually constrained by the coherency protocol, whose overhead rises with system size, limiting the scalability of the final system. In this paper we propose an efficient and cost-effective way to increase the memory available for a given application by leveraging free memory in other computers in the cluster. Our proposal is based on the observation that many applications benefit from having more memory resources but do not require more computing cores, thus reducing the requirements for cache coherency and allowing a simpler implementation and better scalability. Simulation results show that, when additional mechanisms intended to hide remote memory latency are used, execution time of applications that use our proposal is similar to the time required to execute them in a computer populated with enough local memory, thus validating the feasibility of our proposal. We are currently building a prototype that implements our ideas. The first results from real executions in this prototype demonstrate not only that our proposal works but also that it can efficiently execute applications that make use of remote memory resources. {\copyright} 2011 Springer Science+Business Media, LLC.",
	doi = "10.1007/s10586-010-0150-7",
	issn = "1386-7857",
	journal = "Cluster Computing",
	key = "Computer simulation",
	keywords = "Parallel architectures;Scalability;",
	note = "Cache coherency;Computing resource;Degree of freedom;Execution time;Free memory;Local memories;Memory allocation;Memory resources;Parallel Computing;Remote memory;Shared-memory computers;Simulation result;System size;",
	pages = "1--23",
	title = "{A} new degree of freedom for memory allocation in clusters",
	url = "http://dx.doi.org/10.1007/s10586-010-0150-7",
	year = 2011
}

Antoni Roca, Jose Flich, Federico Silla and Jose Duato. VCTlite: Towards an Efficient Implementation of Virtual Cut-Through Switching in On-Chip Networks. In 17th Int'l Conference on High Performance Computing (HiPC) In Press. December 2010. BibTeX

@conference{roca-hipc10,
	author = "Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
	address = "Goa, India",
	booktitle = "17th Int'l Conference on High Performance Computing (HiPC)",
	keywords = "on-chip networks; switching;",
	month = dec,
	note = "In Press",
	title = "{VCT}lite: {T}owards an {E}fficient {I}mplementation of {V}irtual {C}ut-{T}hrough {S}witching in {O}n-{C}hip {N}etworks",
	year = 2010
}

D Flich J.; Bertozzi (ed.). Designing Network On-Chip Architectures in the Nanoscale Era. CRC Press, December 2010. URL BibTeX

@book{365336,
	author = "Gilabert, Francisco and Silla, Federico and Gomez, Maria E. and Lodde, Mario and Roca, Antoni and Flich, Jose and Duato, Jose and Hern{\'a}ndez, Carles and Rodrigo, Samuel",
	abstract = "Going beyond isolated research ideas and design experiences, Designing Network On-Chip Architectures in the Nanoscale Era covers the foundations and design methods of network on-chip (NoC) technology. The contributors draw on their own lessons learned to provide strong practical guidance on various design issues. Exploring the design process of the network, the first part of the book focuses on basic aspects of switch architecture and design, topology selection, and routing implementation. In the second part, contributors discuss their experiences in the industry, offering a roadmap to recent products. They describe Tilera's TILE family of multicore processors, novel Intel products and research prototypes, and the TRIPS operand network (OPN). The last part reveals state-of-the-art solutions to hardware-related issues and explains how to efficiently implement the programming model at the network interface. In the appendix, the microarchitectural details of two switch architectures targeting multiprocessor system-on-chips (MPSoCs) and chip multiprocessors (CMPs) can be used as an experimental platform for running tests. A stepping stone to the evolution of future chip architectures, this volume provides a how-to guide for designers of current NoCs as well as designers involved with 2015 computing platforms. It cohesively brings together fundamental design issues, alternative design paradigms and techniques, and the main design tradeoffs---consistently focusing on topics most pertinent to real-world NoC designers.",
	editor = "Flich, Jose and Bertozzi, D.",
	isbn = 9781439837108,
	keywords = "Network on chip;Chip Architectures;",
	month = dec,
	publisher = "CRC Press",
	title = "{D}esigning {N}etwork {O}n-{C}hip {A}rchitectures in the {N}anoscale {E}ra",
	url = "http://www.crcpress.com/product/isbn/9781439837108",
	year = 2010
}

A Strano, Carles Hernández, Federico Silla and D Bertozzi. Process variation and layout mismatch tolerant design of source synchronous links for GALS networks-on-chip. In System on Chip (SoC), 2010 International Symposium on. 2010, 43 -48. URL, DOI BibTeX

@conference{5625539,
	author = "Strano, A. and Hern{\'a}ndez, Carles and Silla, Federico and Bertozzi, D.",
	abstract = "Synchronization interfaces in a network-on-chip (NoC) are becoming vulnerable points that need to be safeguarded against link delay variations and signal misalignments. This paper addresses the challenge of designing a process variation and layout mismatch tolerant link for GALS NoCs by implementing a self-calibration mechanism. A variation detector senses the variability-induced misalignment between data lines with themselves and with the transmitter clock routed with data in source synchronous links. Then, a suitable delayed replica of the transmitter clock is selected for safe sampling of misaligned data. The paper proves correct operation of the GALS link augmented with the variation detector and compares its reliability with that of a detector-less link, beyond proving robustness with respect to the delay variability affecting the detector itself.",
	booktitle = "System on Chip (SoC), 2010 International Symposium on",
	doi = "10.1109/ISSOC.2010.5625539",
	isbn = "978-1-4244-8279-5",
	keywords = "GALS networks-on-chip;layout mismatch tolerant design;link delay variations;process variation;self-calibration mechanism;signal misalignments;source synchronous links;synchronization interfaces;transmitter clock;delays;integrated circuit layout;network-on",
	month = sep,
	pages = "43--48",
	title = "{P}rocess variation and layout mismatch tolerant design of source synchronous links for {GALS} networks-on-chip",
	url = "http://dx.doi.org/10.1109/ISSOC.2010.5625539",
	year = 2010
}

Antoni Roca, Jose Flich, Federico Silla and Jose Duato. A Latency-Efficient Router Architecture for CMP Systems. In Digital System Design: Architectures, Methods and Tools (DSD), 2010 13th Euromicro Conference on. 2010, 165 -172. URL, DOI BibTeX

@conference{5615623,
	author = "Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
	abstract = "As technology advances, the number of cores in Chip Multi Processor systems (CMPs) and Multi Processor Systems-on-Chips (MPSoCs) keeps increasing. Current test chips and products reach tens of cores, and it is expected to reach hundreds of cores in the near future. Such complexity demands for an efficient network-on-chip (NoC). The common choice to build such networks is the 2D mesh topology (as it matches the regular tile-based design) and the Dimension-Order Routing (DOR) algorithm (because its simplicity). The network in such systems must provide sustained throughput and ultra low latencies. One of the key components in the network is the router, and thus, it plays a major role when designing for such performance levels. In this paper we propose a new pipelined router design focused in reducing the router latency. As a first step we identify the router components that take most of the critical path, and thus limit the router frequency. In particular, the arbiter is the one limiting the performance of the router. Based on this fact, we simplify the arbiter logic by using multiple smaller arbiters. The initial set of requests in the initial arbiter is then distributed over the smaller arbiters that operate in parallel. With this design procedure, and with a proper internal router organization, different router architectures are evolved. All of them enable the use of smaller arbiters in parallel by replicating ports and assuming the use of the DOR algorithm. The net result of such changes is a faster router. Preliminary results demonstrate a router latency reduction ranging from 10% to 21% with an increase of the router area. Network latency is reduced in a range from 11% to 15%.",
	booktitle = "Digital System Design: Architectures, Methods and Tools (DSD), 2010 13th Euromicro Conference on",
	doi = "10.1109/DSD.2010.42",
	isbn = "978-1-4244-7839-2",
	keywords = "arbiter design;low latency router;network-on-chip;router architecture;router design",
	month = sep,
	pages = "165--172",
	title = "{A} {L}atency-{E}fficient {R}outer {A}rchitecture for {CMP} {S}ystems",
	url = "http://dx.doi.org/10.1109/DSD.2010.42",
	year = 2010
}

Héctor Montaner, Federico Silla, H Fröning and Jose Duato. Getting Rid of Coherency Overhead for Memory-Hungry Applications. In Cluster Computing (CLUSTER), 2010 IEEE International Conference on. 2010, 48 -57. URL, DOI BibTeX

@conference{5600323,
	author = {Montaner, H{\'e}ctor and Silla, Federico and Fr{\"o}ning, H. and Duato, Jose},
	abstract = "Current commercial solutions intended to provide additional resources to an application being executed in a cluster usually aggregate processors and memory from different nodes. In this paper we present a 16-node prototype for a shared-memory cluster architecture that follows a different approach by decoupling the amount of memory available to an application from the processing resources assigned to it. In this way, we provide a new degree of freedom so that the memory granted to a process can be expanded with the memory from other nodes in the cluster without increasing the number of processors used by the program. This feature is especially suitable for memory-hungry applications that demand large amounts of memory but present a parallelization level that prevents them from using more cores than available in a single node. The main advantage of this approach is that an application can use more memory from other nodes without involving the processors, and caches, from those nodes. As a result, using more memory no longer implies increasing the coherence protocol overhead because the number of caches involved in the coherent domain has become independent from the amount of available memory. The prototype we present in this paper leverages this idea by sharing 128GB of memory among the cluster. Real executions show the feasibility of our prototype and its scalability.",
	booktitle = "Cluster Computing (CLUSTER), 2010 IEEE International Conference on",
	doi = "10.1109/CLUSTER.2010.14",
	keywords = "16-node prototype;coherence protocol overhead;coherent domain;memory decoupling;memory hungry application;parallelization level;processing resource;shared memory cluster architecture;cache storage;memory architecture;pattern clustering;program processors;",
	month = sep,
	pages = "48--57",
	title = "{G}etting {R}id of {C}oherency {O}verhead for {M}emory-{H}ungry {A}pplications",
	url = "http://dx.doi.org/10.1109/CLUSTER.2010.14",
	year = 2010
}

Héctor Montaner, Federico Silla and Jose Duato. A practical way to extend shared memory support beyond a motherboard at low cost. In Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing. June 2010, 155-166. URL, DOI BibTeX

@conference{Montaner:2010:PWE:1851476.1851495,
	author = "Montaner, H{\'e}ctor and Silla, Federico and Duato, Jose",
	abstract = "Improvements in parallel computing hardware usually involve increments in the number of available resources for a given application such as the number of computing cores and the amount of memory. In the case of shared-memory computers, the increase in computing resources and available memory is usually constrained by the coherency protocol, whose overhead rises with system size, limiting the scalability of the final system. In this paper we propose an efficient and cost-effective way to increase the memory available for a given application by leveraging free memory in other computers in the cluster. Our proposal is based on the observation that many applications benefit from having more memory resources but do not require more computing cores, thus reducing the requirements for cache coherency and allowing a simpler implementation and better scalability. Simulation results show that, when additional mechanisms intended to hide remote memory latency are used, execution time of applications that use our proposal is similar to the time required to execute them in a computer populated with enough local memory, thus validating the feasibility of our proposal. We are currently building a prototype that implements our ideas.",
	address = "Chicago, Illinois",
	booktitle = "Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing",
	doi = "10.1145/1851476.1851495",
	isbn = "978-1-60558-942-8",
	keywords = "memory;",
	month = jun,
	pages = "155--166",
	publisher = "ACM",
	series = "HPDC '10",
	title = "{A} practical way to extend shared memory support beyond a motherboard at low cost",
	url = "http://doi.acm.org/10.1145/1851476.1851495",
	year = 2010
}

Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Addressing Manufacturing Challenges with Cost-Efficient Fault Tolerant Routing. In Networks-on-Chip (NOCS), 2010 Fourth ACM/IEEE International Symposium on. May 2010, 25 -32. URL, DOI BibTeX

@conference{5507564,
	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and Medardoni, S. and Bertozzi, D. and Camacho, Jesus and Silla, Federico and Duato, Jose",
	abstract = "The high-performance computing domain is enriching with the inclusion of Networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. In this paper, uLBDR (Universal Logic-Based Distributed Routing) is proposed as an efficient logic-based mechanism that adapts to any irregular topology derived from 2D meshes, being an alternative to the use of routing tables (either at routers or at end-nodes). uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the trade-off between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the trade-off between fault tolerance and performance.",
	booktitle = "Networks-on-Chip (NOCS), 2010 Fourth ACM/IEEE International Symposium on",
	doi = "10.1109/NOCS.2010.12",
	keywords = "NoC;addressing manufacturing challenges;application level parallelism;cost efficient fault tolerant routing;logic based mechanism;networks-on-chip;power aware techniques;universal logic based distributed routing;network routing;network topology;network-on",
	month = may,
	pages = "25--32",
	title = "{A}ddressing {M}anufacturing {C}hallenges with {C}ost-{E}fficient {F}ault {T}olerant {R}outing",
	url = "http://dx.doi.org/10.1109/NOCS.2010.12",
	year = 2010
}

Carles Hernández, Antoni Roca, Federico Silla, Jose Flich and Jose Duato. Improving the Performance of GALS-Based NoCs in the Presence of Process Variation. In 2010 ACM/IEEE International Symposium on Networks-on-Chip (NOCS). May 2010, 35 - 42. URL, DOI BibTeX

@conference{11416504,
	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Silla, Federico and Flich, Jose and Duato, Jose",
	abstract = "Current integration scales allow designing chip multiprocessors (CMP) where cores are interconnected by means of a network-on-chip (NoC). Unfortunately, the small feature size of current integration scales cause some unpredictability in manufactured devices because of process variation. In NoCs, variability may affect links and routers causing that they do not match the parameters established at design time. In this paper we first analyze the way that manufacturing deviations affect the components of a NoC by applying a comprehensive and detailed variability model to 200 instances of an 8$\times$8 mesh NoC synthesized using 45 nm technology. A second contribution of this paper is showing that GALS-based NoCs present communication bottlenecks under process variation. To overcome this performance reduction we draft a novel approach, called performance domains, intended to reduce the negative impact of variability on application execution time. This mechanism is suitable when several applications are simultaneously running in the CMP chip.",
	address = "Grenoble, France",
	booktitle = "2010 ACM/IEEE International Symposium on Networks-on-Chip (NOCS)",
	doi = "10.1109/NOCS.2010.13",
	keywords = "integrated circuit design;large scale integration;network-on-chip;performance evaluation;",
	month = may,
	note = "GALS-based NoCs;chip multiprocessors;network-on-chip;manufacturing deviations;process variation;performance domains;integration scales;",
	pages = "35--42",
	publisher = "ACM",
	title = "{I}mproving the {P}erformance of {GALS}-{B}ased {N}o{C}s in the {P}resence of {P}rocess {V}ariation",
	url = "http://dx.doi.org/10.1109/NOCS.2010.13",
	year = 2010
}

Carles Hernández, Federico Silla and Jose Duato. A Methodology for the Characterization of Process Variation in NoC Links. In 2010 Design, Automation & Test in Europe Conference & Exhibition (DATE 2010). March 2010, 685-690. URL BibTeX

@conference{11283352,
	author = "Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose",
	abstract = "Associated with the ever growing integration scales is the increase in process variability. In the context of network-on-chip, this variability affects the maximum frequency that could be sustained by each link that interconnects two cores in a chip multiprocessor. In this paper we present a methodology to model delay variations in NoC links. We also show its application to several technologies, namely 45nm, 32nm, 22nm, and 16nm. Simulation results show that conclusions about variability greatly depend on the implementation context.",
	address = "Dresden, Germany",
	booktitle = "2010 Design, Automation {\&} Test in Europe Conference {\&} Exhibition (DATE 2010)",
	isbn = "978-3-9810801-6-2",
	keywords = "multiprocessor interconnection networks;network-on-chip;",
	month = mar,
	note = "process variation;NoC Links;network-on-chip;chip multiprocessor;process variability;",
	pages = "685--690",
	publisher = "EDDA",
	title = "{A} {M}ethodology for the {C}haracterization of {P}rocess {V}ariation in {N}o{C} {L}inks",
	url = "http://www.date-conference.com/proceedings/PAPERS/2010/DATE10/PDFFILES/06.3_2.PDF",
	year = 2010
}

Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Ort. RCUDA: Reducing the number of GPU-based accelerators in high performance clusters. In High Performance Computing and Simulation (HPCS), 2010 International Conference on. 2010, 224 - 231. URL BibTeX

@conference{20103913258676,
	author = "Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'\i}, Enrique S.",
	abstract = "The increasing computing requirements for GPUs (Graphics Processing Units) have favoured the design and marketing of commodity devices that nowadays can also be used to accelerate general purpose computing. Therefore, future high performance clusters intended for HPC (High Performance Computing) will likely include such devices. However, high-end GPU-based accelerators used in HPC feature a considerable energy consumption, so that attaching a GPU to every node of a cluster has a strong impact on its overall power consumption. In this paper we detail a framework that enables remote GPU acceleration in HPC clusters, thus allowing a reduction in the number of accelerators installed in the cluster. This leads to energy, acquisition, maintenance, and space savings. {\copyright} 2010 IEEE.",
	address = "Caen, France",
	booktitle = "High Performance Computing and Simulation (HPCS), 2010 International Conference on",
	doi = "10.1109/HPCS.2010.5547126",
	journal = "Proceedings of the 2010 International Conference on High Performance Computing and Simulation, HPCS 2010",
	key = "Energy conservation",
	keywords = "Energy utilization;Program processors;",
	note = "Clusters;CUDA;Energy saving;High performance computing;Virtualizations;",
	pages = "224--231",
	title = "r{CUDA}: {R}educing the number of {GPU}-based accelerators in high performance clusters",
	url = "http://dx.doi.org/10.1109/HPCS.2010.5547126",
	year = 2010
}

Jose Duato, Francisco D Igual, Rafael Mayo, Antonio José Peña, Enrique S Quintana-Orti and Federico Silla. An efficient implementation of GPU virtualization in high performance clusters. In Euro-Par 2009 – Parallel Processing Workshops 6043 LNCS. 2010, 385 - 394. URL BibTeX

@conference{20102913080626,
	author = "Duato, Jose and Igual, Francisco D. and Mayo, Rafael and Pe{\~n}a, Antonio Jos{\'e} and Quintana-Ort{\'\i}, Enrique S. and Silla, Federico",
	abstract = "Current high performance clusters are equipped with high bandwidth/low latency networks, lots of processors and nodes, very fast storage systems, etc. However, due to economical and/or power related constraints, in general it is not feasible to provide an accelerating co-processor -such as a graphics processor (GPU)- per node. To overcome this, in this paper we present a GPU virtualization middleware, which makes remote CUDA-compatible GPUs available to all the cluster nodes. The software is implemented on top of the sockets application programming interface, ensuring portability over commodity networks, but it can also be easily adapted to high performance networks. © 2010 Springer-Verlag.",
	address = "Delft, Netherlands",
	booktitle = "Euro-Par 2009 -- Parallel Processing Workshops",
	doi = "10.1007/978-3-642-14122-5_44",
	issn = "0302-9743",
	key = "Data storage equipment",
	keywords = "Application programming interfaces;Computer graphics equipment;Computer software portability;Middleware;Nanotechnology;Program processors;Cluster nodes;Co-processors;Efficient implementation;Graphics processor;High performance cluster;High performance computing;High performance networks;Storage systems;Virtualizations;",
	pages = "385--394",
	series = "Lecture Notes in Computer Science",
	title = "{A}n efficient implementation of {GPU} virtualization in high performance clusters",
	url = "http://dx.doi.org/10.1007/978-3-642-14122-5_44",
	volume = 6043,
	year = 2010
}

Samuel Rodrigo, Carles Hernández, Jose Flich, Federico Silla, Jose Duato, S Medardoni, D Bertozzi and D Dai. Yield-oriented evaluation methodology of network-on-chip routing implementations. In System-on-Chip, 2009. SOC 2009. International Symposium on. 2009, 100–105. URL, DOI BibTeX

@conference{5335667,
	author = "Rodrigo, Samuel and Hern{\'a}ndez, Carles and Flich, Jose and Silla, Federico and Duato, Jose and Medardoni, S. and Bertozzi, D. and Dai, D.",
	abstract = "Network-on-Chip technology is gaining wide popularity for the interconnection of an increasing number of processor cores on the same silicon die. However, growing process variations cause interconnect malfunction or prevent the network from working at the intended frequency, directly impacting yield and manufacturing cost. Topology agnostic routing algorithms have the potential to tolerate process variations without degrading performance. We propose a three step methodology for evaluating routing algorithms in their ability to deal with variability. Using yield enhancement and operation speed preservation as the criteria, we demonstrate how this methodology can be used to select the best design choice among several plausible combinations of routing algorithms and implementations. Also, we show how an efficient table-less routing implementation can be used to minimise the impact of variability on manufacturing and operating frequency.",
	booktitle = "System-on-Chip, 2009. SOC 2009. International Symposium on",
	doi = "10.1109/SOCC.2009.5335667",
	keywords = "Si;interconnect malfunction;network-on-chip routing;processor core interconnection;silicon die;yield enhancement;yield operation;yield oriented evaluation;integrated circuit interconnections;integrated circuit yield;microprocessor chips;network-on-chip;si",
	month = oct,
	pages = "100--105",
	title = "{Y}ield-oriented evaluation methodology of network-on-chip routing implementations",
	url = "http://dx.doi.org/10.1109/SOCC.2009.5335667",
	year = 2009
}

Carles Hernández, Federico Silla, Vicente Santonja and Jose Duato. A new mechanism to deal with process variability in NoC links. In IPDPS 2009 - Proceedings of the 2009 IEEE International Parallel and Distributed Processing Symposium. IEEE Computer Society, 2009. URL BibTeX

@conference{20094812508592,
	author = "Hern{\'a}ndez, Carles and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Associated with the ever growing integration scale of VLSI technologies is the increase in process variability, which makes silicon devices to become less predictable. In the context of network-on-chip (NoC), this variability affects the maximum frequency that could be sustained by each wire of the link that interconnects two cores in a CMP system. Reducing the clock frequency so that all wires can properly work is a trivial solution but, as variability increases, this approach causes an unacceptable performance penalty. In this paper, we propose a new technique to deal with the effects of variability on the links of the NoC that interconnects cores in a CMP system. This technique, called Phit Reduction (PR), retrieves most of the bandwidth still available in links containing wires that are not able to operate at the designed operating frequency. More precisely, our mechanism discards these slow wires and uses all the wires that can work at the design frequency. Two implementations are presented: Local Phit Reduction (LPR), oriented to fabrication processes with very high variability, which requires more hardware but provides higher performance; and Global Phit Reduction (GPR), that requires less additional hardware but is not able to extract all the available bandwidth. The performance evaluation presented in the paper confirms that LPR obtains good results both for low and high variability scenarios. Moreover, in most of our experiments LPR practically achieves the same performance than the ideal network. On the other hand, GPR is appropriate for systems where whithin-die variations are expected to be low. © 2009 IEEE.",
	address = "Rome, Italy",
	booktitle = "IPDPS 2009 - Proceedings of the 2009 IEEE International Parallel and Distributed Processing Symposium",
	doi = "10.1109/IPDPS.2009.5161048",
	key = "Wire",
	keywords = "Bandwidth;Distributed parameter networks;Electric network topology;Machine design;Nanotechnology;Radar antennas;Available bandwidth;Clock frequency;Design frequencies;Fabrication process;High variability;Ideal network;In-process;Maximum frequency;Network on chip;New mechanisms;Operating frequency;Performance evaluation;Performance penalties;Process Variability;Silicon devices;Trivial solutions;VLSI technology;",
	publisher = "IEEE Computer Society",
	title = "{A} new mechanism to deal with process variability in {N}o{C} links",
	url = "http://dx.doi.org/10.1109/IPDPS.2009.5161048",
	year = 2009
}

Héctor Montaner, Vicente Santonja, Federico Silla and Jose Duato. Network reconfiguration suitability for scientific applications. In Parallel Processing, 2008. ICPP '08. 37th International Conference on. 2008, 312 - 319. URL, DOI BibTeX

@conference{10207626,
	author = "Montaner, H{\'e}ctor and Santonja, Vicente and Silla, Federico and Duato, Jose",
	abstract = "This paper analyzes the communication pattern of several scientific applications and how they can make profit of network reconfiguration in order to adapt network topology to the communication needs so that total execution time is reduced. By using an analysis methodology based on real application executions, we study the variation of the required communication bandwidth with time and also the global interprocedural communication patterns. Results show that required bandwidth between each pair of processes does not significantly fluctuates, leading to a constant use of the links and therefore discouraging dynamic reconfigurations of the network during execution time. Nevertheless, the group of busy links changes with each application showing a different communication graph for each of them. Thus, execution time may be accelerated by using an ad-hoc topology, that is, reconfiguring the network before the execution of the application in order to adapt it to the application needs.",
	address = "Piscataway, NJ, USA",
	booktitle = "Parallel Processing, 2008. ICPP '08. 37th International Conference on",
	doi = "10.1109/ICPP.2008.58",
	journal = "2008 37th International Conference on Parallel Processing (ICPP)",
	keywords = "ad hoc networks;application program interfaces;message passing;natural sciences computing;telecommunication network topology;network reconfiguration suitability;scientific applications;network topology;global interprocedural communication patterns;communication graph;ad-hoc topology;message passing interface;",
	pages = "312--319",
	title = "{N}etwork reconfiguration suitability for scientific applications",
	url = "http://dx.doi.org/10.1109/ICPP.2008.58",
	year = 2008
}

Juan Manuel Orduna, Federico Silla and Jose Duato. On the development of a communication-aware task mapping technique. Journal of Systems Architecture 50(4):207 - 220, 2004. URL BibTeX

@article{2004178128206,
	author = "Ordu{\~n}a, Juan Manuel and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. In these systems, although currently existing networks actually provide enough bandwidth for the existing applications and workstations, the trend is towards the interconnection network becoming the system bottleneck. Therefore, in the future, scheduling strategies will have to take into account the communication requirements of the applications and the communication bandwidth that the network can offer. One of the key issues in these strategies is the task mapping technique used when the network becomes the system bottleneck. In this paper, we propose a communication-aware mapping technique that tries to match as well as possible the existing network resources to the communication requirements of the applications running on the system. Also, we evaluate the mapping technique using real MPI application traces with timestamps. Evaluation results show that the use of the proposed mapping technique better exploits the available network bandwidth, improving load balancing and increasing the throughput that can be delivered by the network. Therefore, the proposed technique can be used in the design of communication-aware scheduling strategies for those situations where the communication requirements lead the network bandwidth to become the system performance bottleneck. © 2003 Elsevier B.V. All rights reserved.",
	doi = "10.1016/j.sysarc.2003.09.002",
	issn = "1383-7621",
	journal = "Journal of Systems Architecture",
	key = "Interconnection networks",
	keywords = "Bandwidth;Computational complexity;Computer systems;Cost effectiveness;Evaluation;Mapping;Problem solving;Program processors;Scheduling;Cluster computing;Task scheduling;",
	number = 4,
	pages = "207--220",
	title = "{O}n the development of a communication-aware task mapping technique",
	url = "http://dx.doi.org/10.1016/j.sysarc.2003.09.002",
	volume = 50,
	year = 2004
}

R Garcia, Jose Duato and Federico Silla. LSOM: A Link State protocol Over MAC addresses for metropolitan backbones using Optical Ethernet switches. 2003, 315 - 21. URL BibTeX

@conference{7659346,
	author = "Garcia, R. and Duato, Jose and Silla, Federico",
	abstract = {This paper presents a new protocol named "Link State Over MAC" (LSOM) for Optical Ethernet switches to allow the use of active loop topologies, like meshes, in Metropolitan Area Networks (MAN) or even Wide Area Networks (WAN) backbone. In this respect, LSOM is an alternative to a ring topology as proposed in draft IEEE 802.17 Resilient Packet Ring (RPR) or a tree topology using IEEE802. 1D Rapid Spanning Tree Protocol (RSTP). LSOM provides higher scalability and is able to achieve better bandwidth utilization and lower latency than RSTP and RPR. Simulation results for 4-node and 9-node topologies show that LSOM can improve throughput over RPR by a factor of up to 1.7. Furthermore, full freedom to choose any MAN active topology allows an effective use of the available dark fiber resources},
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Second IEEE International Symposium on Network Computing and Applications. NCA 2003",
	doi = "10.1109/NCA.2003.1201170",
	keywords = "metropolitan area networks;protocols;SONET;Metropolitan Area Networks;protocol;Link State Over MAC;LSOM;Optical Ethernet switches;active loop topologies;scalability;bandwidth utilization;latency;",
	pages = "315--321",
	title = "{LSOM}: {A} {L}ink {S}tate protocol {O}ver {MAC} addresses for metropolitan backbones using {O}ptical {E}thernet switches",
	url = "http://dx.doi.org/10.1109/NCA.2003.1201170",
	year = 2003
}

J M Orduna, Federico Silla and Jose Duato. A clustering method for modeling the communication requirements of message-passing applications. Computing and Informatics 21(1):1 - 16, 2002. BibTeX

@article{7407405,
	author = "Ordu{\~n}a, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. Usually these systems become heterogeneous as they grow, due to their incremental capabilities. Many research activities have focused on the problem of task scheduling in heterogeneous systems from the computational point of view. However, an ideal scheduling strategy would also take into account the communication requirements of the applications and the communication bandwidth available in the network. One of the key issues in this strategy is the measurement of the communication requirements for each application. We propose a clustering-based method to characterize the communications between processes generated by message-passing applications. This technique provides a model consisting of several partitions of the processes generated by the application. Also, we propose a criterion to measure the quality of the obtained partitions. This approach can be used when a given application is repeatedly executed with different input data. Results show that the proposed method can provide a partition with the highest ratio between the intracluster and the intercluster required communication bandwidth. This partition can be used to map groups of processes to processors in the heterogeneous system",
	address = "Slovakia",
	issn = "0232-0274",
	journal = "Computing and Informatics",
	keywords = "message passing;performance evaluation;resource allocation;scheduling;workstation clusters;clustering method;communication requirements;message-passing applications;cost-effective;high-performance computing;task scheduling;heterogeneous systems;interconnection networks;cluster computing;communication bandwidth;intracluster;intercluster;",
	number = 1,
	pages = "1--16",
	title = "{A} clustering method for modeling the communication requirements of message-passing applications",
	volume = 21,
	year = 2002
}

J M Orduna, Federico Silla and Jose Duato. Towards a communication-aware task scheduling strategy for heterogeneous systems. Computing and Informatics 20(3):245 - 67, 2001. BibTeX

@article{7109983,
	author = "Ordu{\~n}a, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Many research activities have focused on the problem of task scheduling in heterogeneous systems from the computational point of view. However, a scheduling strategy should also take into account the communication requirements of the applications and the communication bandwidth offered by the network. Towards this end, we first propose a model of communication cost between network nodes. This model can be used to properly characterize the existing network resources. Second, we propose a criterion to measure the suitability of each allocation of network resources to each parallel application, according to the communication requirements. Third, we propose a scheduling technique based exclusively on this criterion that provides a near-optimal mapping of processes to processors according to the communication requirements. Evaluation results show that the use of this scheduling technique fully exploits the available network bandwidth, greatly improving network performance. Therefore, the proposed scheduling technique can be used in the design of communication-aware scheduling strategies for those situations where the communication requirements are the system performance bottleneck",
	address = "Slovakia",
	issn = "0232-0274",
	journal = "Computing and Informatics",
	keywords = "directed graphs;performance evaluation;processor scheduling;resource allocation;trees (mathematics);workstation clusters;communication-aware task scheduling strategy;heterogeneous systems;communication cost;network nodes;network resources;parallel application;near-optimal mapping;available network bandwidth;network performance;performance bottleneck;interconnection networks;cluster computing;",
	number = 3,
	pages = "245--267",
	title = "{T}owards a communication-aware task scheduling strategy for heterogeneous systems",
	volume = 20,
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the scalability of topologies for storage area networks in building environments. 2001, 332 - 5. URL BibTeX

@conference{7114065,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Nowadays, the fast growth of data intensive applications is changing the way storage is devised. The traditional server-to-disk approach is being replaced by storage area networks (SANs), which are a separate network for storage, isolated from the messaging network and optimized for the movement of data between servers and storage devices (usually disks). We analyze the performance and cost scalability of a family of network topologies devised to be used in building environments. Performance simulation results combined with cost estimations have revealed that slight modifications in network topology can affect the overall scalability. In particular wraparound links connecting the lowest and highest floors in the building significantly affect the scalability of the network. Anyway, the use of this kind of links by itself does not provide the best solution. It is also necessary to have a good interconnection pattern in the backbone",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings IEEE International Symposium on Network Computing and Applications. NCA 2001",
	doi = "10.1109/NCA.2001.962549",
	keywords = "digital storage;local area networks;network topology;telecommunication network routing;storage area networks;building environments;data intensive applications;servers;storage devices;cost scalability;performance simulation;cost estimations;network topology;wraparound links;interconnection pattern;backbone;",
	pages = "332--335",
	title = "{O}n the scalability of topologies for storage area networks in building environments",
	url = "http://dx.doi.org/10.1109/NCA.2001.962549",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the switch architecture for fibre channel storage area networks. 2001, 484 - 491. URL BibTeX

@conference{2001416673902,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "The fast growth of data intensive applications has caused a change in the traditional storage model. The server-to-disk approach is being replaced by storage area networks (SANs), which enable storage to be externalized from servers, thus allowing storage devices to be shared among multiple servers. Nowadays, the majority of SANs use Fibre Channel. The standard for Fibre Channel defines several issues related to the switch interface, but does not make any suggestion about the internal switch architecture to be implemented by manufacturers. In this paper we analyze the key architectural switch characteristics for building Fibre Channel storage area networks. To do so, our starting point is the performance analysis of two different switch architectures, identifying their strongest and weakest points, and thus taking advantage of the best features from both of them. After this first analysis, we introduce several other features in the switch, concluding with a proposed architecture that doubles network throughput while reducing response delay.",
	address = "Kyongju, Korea, Republic of",
	booktitle = "Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS",
	doi = "10.1109/ICPADS.2001.934857",
	key = "Client server computer systems",
	keywords = "Computer architecture;Computer networks;Data storage equipment;Network protocols;Fiber channel;Storage area network;",
	pages = "484--491",
	title = "{O}n the switch architecture for fibre channel storage area networks",
	url = "http://dx.doi.org/10.1109/ICPADS.2001.934857",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the impact of message packetization in networks of workstations with irregular topology. 2001, 3 - 10. URL BibTeX

@conference{6867161,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming an increasingly popular alternative to parallel computers for those applications with high needs of resources such as memory capacity and input/output storage space, and also for small scale parallel computing. Usually, the software messaging layers in these systems become a bottleneck due to the overhead they introduce. Some proposals like FM and BIP considerably reduce this overhead by splitting long messages into several packets. These proposals have been shown to improve communication performance. However, the effect of message packetization on the network interconnects has not been analyzed yet. In this paper we examine the effect of message packetization from the point of view of the interconnection network in the context of bimodal traffic. Two different routing algorithms have been considered: up*/down* and minimal adaptive routing. Our study shows that when the up */down* routing algorithm is used, message packetization dramatically increases latency and reduces throughput for both long and short messages. On the other hand, if minimal adaptive routing is used, short messages could benefit from message packetization, but at the cost of increasing latency for long messages. In any case, network throughput is considerably reduced",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Ninth Euromicro Workshop on Parallel and Distributed Processing",
	doi = "10.1109/EMPDP.2001.904960",
	keywords = "multiprocessor interconnection networks;network routing;performance evaluation;workstation clusters;message packetization;networks of workstations;irregular topology;resources;memory capacity;input/output storage space;software messaging layers;interconnection network;bimodal traffic;routing algorithms;minimal adaptive routing;latency;",
	pages = "3--10",
	title = "{O}n the impact of message packetization in networks of workstations with irregular topology",
	url = "http://dx.doi.org/10.1109/EMPDP.2001.904960",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. Improving network performance by efficiently dealing with short control messages in fibre channel SANs. 2001, 901 - 10. BibTeX

@conference{7219763,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Traffic in a storage area networks (SANs) is bimodal, composed of long messages carrying several KBytes of data, and short messages containing control information (I/O commands). From the network point of view, latency of control messages is highly affected by the transmission of data messages, due to their length. As a consequence, it is necessary to establish management policies that benefit the transmission of short control messages, thus reducing the overall response time for I/O operations and increasing network throughput. We propose several strategies for dealing with short control messages and analyze their impact on the performance of storage area networks. This analysis is carried out for a fully adaptive routing algorithm in the context of two different network topology environments: buildings and departments. Simulation results show that both I/O response time and network throughput may be improved when efficiently managing control messages",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2001 Parallel Processing. 7th International Euro-Par Conference. Proceedings",
	keywords = "digital storage;local area networks;network performance;short control messages;fibre channel SANs;storage area networks;bimodal traffic;latency;data messages;management policies;response time;I/O operations;network topology environments;",
	pages = "901--910",
	series = "Lecture Notes in Computer Science",
	title = "{I}mproving network performance by efficiently dealing with short control messages in fibre channel {SAN}s",
	volume = 2150,
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. A tool for the design and evaluation of fibre channel storage area networks. 2001, 133 - 140. URL BibTeX

@conference{2001296584391,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "The fast growth of data intensive applications has caused a change in the traditional storage model. The server-to-disk approach, usually implemented with SCSI buses, is being replaced by storage area networks (SANs), which enable storage to be externalized from servers, thus allowing storage devices to be shared among multiple servers. A SAN is a separate network for storage, isolated from the messaging network and optimized for the movement of data between servers and storage devices. Nowadays, most of current SANs use Fibre Channel as the technology to move data between servers and storage devices. In order to design and evaluate the performance of these systems it is necessary to have adequate tools. Usually, performance evaluation may be based on analytical modeling or simulation. Each of them differs in their scope and applicability. However, the simulation modeling technique offers more freedom, flexibility, and accuracy than analytical methods. Thus, when evaluating the performance of SANs, simulation modeling should be used. In this paper we present the main capabilities of a simulator for Fibre Channel SANs, focusing on its input parameters and output variables. We also show several simple examples of performance measurements that can be obtained using this tool.",
	address = "Seattle, WA, United States",
	booktitle = "Proceedings of the IEEE Annual Simulation Symposium",
	doi = "10.1109/SIMSYM.2001.922125",
	issn = "0272-4715",
	key = "Data storage equipment",
	keywords = "Client server computer systems;Communication channels;Computer simulation;Local area networks;Mathematical models;Optimization;Fiber channel storage area networks;Multiple servers;",
	pages = "133--140",
	title = "{A} tool for the design and evaluation of fibre channel storage area networks",
	url = "http://dx.doi.org/10.1109/SIMSYM.2001.922125",
	year = 2001
}

J M Orduna, Federico Silla and Jose Duato. A new task mapping technique for communication-aware scheduling strategies. 2001, 349 - 54. URL BibTeX

@conference{7075370,
	author = "Ordu{\~n}a, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. In these systems, the trend is towards the interconnection network becoming the system bottleneck. Therefore, in the future, scheduling strategies will have to take into account the communication requirements of the applications and the communication bandwidth that the network can offer. One of the key issues in these strategies is the task mapping technique used when the network becomes the system bottleneck. In this paper, we propose an enhanced version of a previously proposed mapping technique that takes into account not only the existing network resources, but also the traffic generated by the applications. Also, we evaluate the mapping technique using real MPI application traces with timestamps. Evaluation results show that the rise of the new mapping technique fully exploits the available network bandwidth, improving load balancing and increasing the throughput that can be delivered by the network",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings International Conference on Parallel Processing Workshops",
	doi = "10.1109/ICPPW.2001.951971",
	keywords = "multiprocessor interconnection networks;performance evaluation;processor scheduling;workstation clusters;interconnection network;scheduling;clusters;communication-aware scheduling;mapping technique;MPI application traces;task mapping;",
	pages = "349--354",
	title = "{A} new task mapping technique for communication-aware scheduling strategies",
	url = "http://dx.doi.org/10.1109/ICPPW.2001.951971",
	year = 2001
}

Jose Duato, Antonio Robles, Federico Silla and R Beivide. A Comparison of Router Architectures for Virtual Cut-Through and Wormhole Switching in a NOW Environment. Journal of Parallel and Distributed Computing 61(2):224 - 253, 2001. URL BibTeX

@article{2004488488316,
	author = "Duato, Jose and Robles, Antonio and Silla, Federico and Beivide, R.",
	abstract = "Most multicomputer interconnection networks use wormhole switching, leading to fast and compact routers. Current routers incorporate virtual channels and even fully adaptive routing. Networks of workstations (NOWs) inherited multicomputer technology. Most commercial routers designed for NOWs implement wormhole switching. However, wormhole switching is not well suited for NOWs. The long wires required in this environment lead to large buffers to prevent buffer overflow during flow control signaling. Moreover, wire length is limited by buffer size. Virtual cut-through (VCT) achieves a higher throughput than wormhole switching. However, buffer requirements and packetizing overhead prevented its widespread use in multicomputers. Nevertheless, wormhole and VCT switching require similar buffer capacity in NOWs. Moreover, some messaging layers such as Illinois Fast Messages (FM) and BIP split messages into packets for increased performance. Therefore, the traditional disadvantages of VCT switching disappear in NOWs. In this paper, we show that VCT routers can be simpler than wormhole routers, while still achieving the advantages of using virtual channels and adaptive routing. We also propose a fully adaptive routing algorithm for VCT switching in a NOW environment. Moreover, we show that VCT routers outperform wormhole routers in a NOW environment at a lower cost. Also, VCT routers require buffer capacity independent of wire length, making them suitable for networks of workstations. © 2001 Academic Press.",
	address = "Orlando, United States",
	doi = "10.1006/jpdc.2000.1679",
	issn = "0743-7315",
	journal = "Journal of Parallel and Distributed Computing",
	number = 2,
	pages = "224--253",
	title = "{A} {C}omparison of {R}outer {A}rchitectures for {V}irtual {C}ut-{T}hrough and {W}ormhole {S}witching in a {NOW} {E}nvironment",
	url = "http://dx.doi.org/10.1006/jpdc.2000.1679",
	volume = 61,
	year = 2001
}

Juan Carlos Martinez, Federico Silla, Pedro Lopez and Jose Duato. On the influence of the selection function on the performance of networks of workstations. 2000, 292 - 9. BibTeX

@conference{6977556,
	author = "Martinez, Juan Carlos and Silla, Federico and Lopez, Pedro and Duato, Jose",
	abstract = "Previous research has pointed out the influence of adaptive routing on the performance improvement of interconnection networks for clusters of workstations. One of the design issues of adaptive routing algorithms is the selection function, which selects the output channel among all the available choices. We analyze in detail several selection functions in order to evaluate their influence on network performance. Simulation results show that network throughput may be increased up to 10\%. When the network is close to saturation, improvements in latency up to 40\% may be achieved",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing. Third International Symposium, ISHPC 2000. Proceedings (Lecture Notes in Computer Science Vol.1940)",
	keywords = "delays;multiprocessor interconnection networks;network routing;network topology;performance evaluation;workstation clusters;",
	note = "selection function;networks of workstations;interconnection networks;workstation clusters;adaptive routing algorithms;performance evaluation;network throughput;latency;",
	pages = "292--299",
	title = "{O}n the influence of the selection function on the performance of networks of workstations",
	year = 2000
}

Federico Silla and Jose Duato. On the use of virtual channels in networks of workstations with irregular topology. IEEE Transactions on Parallel and Distributed Systems 11(8):813–828, 2000. URL BibTeX

@article{2000515393317,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations are becoming increasingly popular as a cost-effective alternative to parallel computers. Typically, these networks connect workstations using irregular topologies, providing the wiring flexibility, scalability, and incremental expansion capability required in this environment. Recently, we proposed two methodologies for the design of adaptive routing algorithms for networks with irregular topology, as well as fully adaptive routing algorithms for these networks. These algorithms increase throughput considerably with respect to previously existing ones, but require the use of at least two virtual channels. In this paper, we propose a very efficient flow control protocol to support virtual channels when link wires are very long and/or have different lengths. This flow control protocol relies on the use of channel pipelining and control flits. Control traffic is minimized by assigning physical bandwidth to virtual channels until the corresponding message blocks or it is completely transmitted. Simulation results show that this flow control protocol performs as efficiently as an ideal network with short wires and flit-by-flit multiplexing. The effect of additional virtual channels per physical channel has also been studied, revealing that the optimal number of virtual channels varies with network size. The use of virtual channel priorities is also analyzed. The proposed flow control protocol may increase short message latency, due to long messages monopolizing channels and hindering the progress of short messages. Therefore, we have analyzed the impact of limiting the number of flits (block size) that a virtual channel may forward once it gets the link. Simulation results show that limiting the maximum block size causes the overall network performance to decrease.",
	address = "Los Alamitos, CA, United States",
	doi = "10.1109/71.877939",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Network protocols",
	keywords = "Adaptive algorithms;Bandwidth;Communication channels;Computer simulation;Computer workstations;Congestion control;Multiplexing;Pipeline processing systems;Telecommunication traffic;",
	note = "Adaptive routing algorithms;Block multiplexing;Channel pipelining;Virtual channels;Wormhole switching;",
	number = 8,
	pages = "813--828",
	title = "{O}n the use of virtual channels in networks of workstations with irregular topology",
	url = "http://dx.doi.org/10.1109/71.877939",
	volume = 11,
	year = 2000
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. Performance analysis of storage area networks using high-speed LAN interconnects. 2000, 474–478. URL BibTeX

@conference{6783964,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Storage area networks (SANs) are an emerging data communications platform which interconnects servers and storage devices (such as disks, disk arrays, and tape drives) to create a pool of storage that users can access directly. SANs eliminate the bandwidth bottlenecks and scalability limitations imposed by previous SCSI bus-based architectures and LAN connections between servers and the stored data. This networking approach reports benefits such as computer clustering, topological flexibility, fault tolerance, high availability, and remote management. The prominent technology for implementing SANs is the fibre channel, due to the suitability of this technology for storage networking. Other technologies for high performance interconnects have also been developed. These interconnects provide switch-based networks with links transferring data at more than 1 Gigabit per second, being mainly used in the LAN environments. We analyze whether these high-speed LAN technologies could also be an interesting alternative to storage networking. We perform this analysis using real-world I/O traces. The main conclusion from our study is that most of the messages present the base network latency, meaning that the network is not heavily loaded. Moreover the response time is, in general, acceptable, being dominated by the time disks need to process the requests",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings IEEE International Conference on Networks 2000 (ICON 2000). Networking Trends and Challenges in the New Millennium",
	doi = "10.1109/ICON.2000.875833",
	keywords = "data communication;digital storage;disc storage;fault tolerance;LAN interconnection;network servers;network topology;performance evaluation;",
	note = "storage area networks;high-speed LAN interconnects;performance analysis;data communications platform;servers interconnection;storage devices;disks;disk arrays;tape drives;computer clustering;topological flexibility;fault tolerance;high availability;fibre channel;switch-based networks;real-world I/O traces;network latency;response time;remote management;",
	pages = "474--478",
	title = "{P}erformance analysis of storage area networks using high-speed {LAN} interconnects",
	url = "http://dx.doi.org/10.1109/ICON.2000.875833",
	year = 2000
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. Performance sensitivity of routing algorithms to failures in networks of workstations. 2000, 230–242. BibTeX

@conference{6977549,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Networks of workstations (NOW) are becoming an increasingly popular alternative to parallel computers for those applications with high needs of resources such as memory capacity and input/output storage space, and also for small-scale parallel computing. Although the mean time between failures (MTBF) for individual links and switches in a NOW is very high, the probability of a failure occurrence dramatically increases as the network size becomes larger. Moreover, there are external factors, such as accidental link disconnections, that also can affect the overall NOW reliability. Until the faulty element is replaced, the NOW is functioning in a degraded mode. Thus, it becomes necessary to quantify how much the global NOW performance is reduced during the time the system remains in this state. We analyze the performance degradation of networks of workstations when failures in links or switches occur. Because the routing algorithm is a key issue in the design of a NOW, we quantify the sensitivity to failures of two routing algorithms: up*/down* and minimal adaptive routing algorithms. Simulation results show that, in general, up*/down* routing is highly robust to failures. On the other hand, the minimal adaptive routing algorithm presents a better performance, even in the presence of failures, but at the expense of a larger sensitivity",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing. Third International Symposium, ISHPC 2000. Proceedings (Lecture Notes in Computer Science Vol.1940)",
	keywords = "computer network reliability;network routing;performance evaluation;probability;workstation clusters;",
	note = "performance sensitivity;networks of workstations;NOW;small-scale parallel computing;mean time between failures;MTBF;failure probability;reliability;performance degradation;up*/down* routing algorithm;minimal adaptive routing algorithm;",
	pages = "230--242",
	title = "{P}erformance sensitivity of routing algorithms to failures in networks of workstations",
	year = 2000
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the effect of link failures in fibre channel storage area networks. 2000, 102–111. URL BibTeX

@conference{6832473,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "The fast growth of data intensive applications has caused a change in the traditional storage model. The server-to-disk approach is being replaced by storage area networks (SANs), which enable storage to be externalized from servers, thus allowing storage devices to be shared among multiple servers. The prominent technology for implementing SANs is Fibre Channel, due to its suitability for storage networking. Although the probability of a link failure for individual links in a SAN is very low, this probability dramatically increases as the network size becomes larger. Moreover, there are external factors, such as accidental link disconnections, that also can affect the overall SAN reliability. Until the faulty element is replaced, the SAN is functioning in a degraded mode. In this paper we analyze by simulation the performance degradation of Fibre Channel storage area networks when failures in links occur, quantifying how much the global SAN performance is reduced during the time the system remains in the degraded state. We perform this analysis by using both synthetic and real I/O traffic. Simulation results show that performance degradation mainly depends on the routing algorithm and the switch architecture used",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings International Symposium on Parallel Architectures, Algorithms and Networks. I-SPAN 2000",
	doi = "10.1109/ISPAN.2000.900269",
	keywords = "optical fibre LAN;optical storage;performance evaluation;",
	note = "link failures;fibre channel storage area networks;storage model;server-to-disk approach;multiple servers;link failure;network size;performance degradation;real I/O traffic;routing algorithm;switch architecture;",
	pages = "102--111",
	title = "{O}n the effect of link failures in fibre channel storage area networks",
	url = "http://dx.doi.org/10.1109/ISPAN.2000.900269",
	year = 2000
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. Modeling and simulation of storage area networks. 2000, 307–314. URL BibTeX

@conference{6735495,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Storage area networks (SANs) are an emerging data communications platform which interconnects servers and storage devices (such as disks, disk arrays, and tape drives) to create a pool of storage that users can access directly. This networking approach reports benefits such as computer clustering, topological flexibility, fault tolerance, high availability, and remote management. In order to evaluate the performance of these systems it is necessary to have the adequate tools. Usually, performance evaluation may be based on analytical modeling or simulation. Each of them differs in their scope and applicability. However the simulation modeling technique offers more freedom, flexibility, and accuracy than the analytical methods. Thus, when evaluating the performance of SANs, simulation modeling should be used. In this paper the issues involved in the modeling and design of a very flexible and easy to use SAN simulator are presented. This tool is able to consider among others, both real-world I/O traces and synthetic I/O traffic, message packetization, faults in links and switches, virtual channels, different routing algorithms, etc. We describe its main internal organization, the basic modeling mechanisms the simulator is based on, the main input parameters and output performance variables. Also, the analysis of preliminary results using I/O traces is presented, showing that the storage network increases self-similarity of the traffic received by servers, latency variations are more important for control messages than for data messages, and links have a low utilization",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 8th International Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems (Cat. No.PR00728)",
	doi = "10.1109/MASCOT.2000.876553",
	keywords = "local area networks;performance evaluation;storage management;virtual machines;",
	note = "storage area networks;modeling;simulation;data communications platform;servers;storage devices;computer clustering;topological flexibility;fault tolerance;high availability;remote management;performance evaluation;real-world I/O traces;synthetic I/O traffic;message packetization;faults;virtual channel;routing algorithms;traffic self-similarity;control messages;data messages;",
	pages = "307--314",
	title = "{M}odeling and simulation of storage area networks",
	url = "http://dx.doi.org/10.1109/MASCOT.2000.876553",
	year = 2000
}

Xavier Molero, Federico Silla and Vicente Santonja. Modeling and simulation of a network of workstations with wormhole switching. Proceedings of the IEEE Annual Simulation Symposium, pages 299–306, 2000. BibTeX

@article{2000295194823,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente",
	abstract = "Networks of workstations (NOW) are becoming a very popular alternative to parallel computers. This article presents a NOW simulator, the basic queueing models it is based on, its main internal organization, input parameters, and output performance variables. Simple examples of performance measures obtained for message fragmentation, failures in links and switches, and self-similar traffic are given.",
	address = "Washington, DC, USA",
	issn = "0272-4715",
	journal = "Proceedings of the IEEE Annual Simulation Symposium",
	key = "Parallel processing systems",
	keywords = "Adaptive algorithms;Computer architecture;Computer simulation;Computer workstations;Data communication systems;Evaluation;Local area networks;Performance;Switching theory;Telecommunication traffic;",
	note = "Failures in links and switches;Irregular topologies;Message fragmentation;Networks of workstations;Routing algorithms;Self similar traffic;Software Package CSIM;Wormhole switching;",
	pages = "299--306",
	title = "{M}odeling and simulation of a network of workstations with wormhole switching",
	year = 2000
}

Xavier Molero, Federico Silla and Vicente Santonja. Modeling and simulation of a network of workstations with wormhole switching. 2000, 299–306. URL BibTeX

@conference{6590026,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente",
	abstract = "Networks of workstations (NOW) are becoming a very popular alternative to parallel computers for those applications with high needs of resources such as memory capacity processing power and input/output storage space. Typically, these networks connect workstations using irregular topologies, providing wiring flexibility, scalability, and incremental expansion capability required in this environment. In order to analyze and design these kind of systems it is necessary to have adequate tools. To address this problem, we have implemented a very flexible and easy to use NOW simulator. It is based on the one presented in (Silla, 1998; Silla and Duato, 1997; 1998) and it includes three more functionalities: it supports a technique for message fragmentation in packets, generates self-similar traffic, and also it can model networks with permanent faulted links or switches. We present this NOW simulator, the basic queueing models it is based on, its main internal organization, input parameters, output performance variables, and finally, we show several simple examples of performance measures obtained for, among others, message fragmentation, failures in links and switches, and self-similar traffic",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 33rd Annual Simulation Symposium (SS 2000)",
	doi = "10.1109/SIMSYM.2000.844928",
	keywords = "digital simulation;packet switching;performance evaluation;telecommunication computing;telecommunication network routing;telecommunication traffic;workstation clusters;",
	note = "network of workstations;network simulation;wormhole switching;parallel computers;memory capacity;input output storage space;wiring flexibility;message fragmentation;self-similar traffic;queueing models;performance measures;",
	pages = "299--306",
	title = "{M}odeling and simulation of a network of workstations with wormhole switching",
	url = "http://dx.doi.org/10.1109/SIMSYM.2000.844928",
	year = 2000
}

Federico Silla and Jose Duato. High-performance routing in networks of workstations with irregular topology. IEEE Transactions on Parallel and Distributed Systems 11(7):699–719, 2000. URL BibTeX

@article{2000465351371,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations are rapidly emerging as a cost-effective alternative to parallel computers. Switch-based interconnects with irregular topology allow the wiring flexibility, scalability, and incremental expansion capability required in this environment. However, the irregularity also makes routing and deadlock avoidance on such systems quite complicated. In current proposals, many messages are routed following nonminimal paths, increasing latency and wasting resources. In this paper, we propose two general methodologies for the design of adaptive routing algorithms for networks with irregular topology. Routing algorithms designed according to these methodologies allow messages to follow minimal paths in most cases, reducing message latency and increasing network throughput. As an example of application, we propose two adaptive routing algorithms for AN1 (previously known as Autonet). They can be implemented either by duplicating physical channels or by splitting each physical channel into two virtual channels. In the former case, the implementation does not require a new switch design. It only requires changing the routing tables and adding links in parallel with existing ones, taking advantage of spare switch ports. In the latter case, a new switch design is required, but the network topology is not changed. Evaluation results for several different topologies and message distributions show that the new routing algorithms are able to increase throughput for random traffic by a factor of up to 4 with respect to the original up*/down* algorithm, also reducing latency significantly. For other message distributions, throughput is increased more than seven times. We also show that most of the improvement comes from the use of minimal routing.",
	address = "Los Alamitos, CA, United States",
	doi = "10.1109/71.877816",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Parallel processing systems",
	keywords = "Adaptive algorithms;Communication channels;Computer workstations;Congestion control;Interconnection networks;Response time;Telecommunication traffic;Topology;",
	note = "Adaptive routing algorithms;Wormhole switching;",
	number = 7,
	pages = "699--719",
	title = "{H}igh-performance routing in networks of workstations with irregular topology",
	url = "http://dx.doi.org/10.1109/71.877816",
	volume = 11,
	year = 2000
}

Xavier Molero, Federico Silla, F Rodriguez and Vicente Santonja. Design and implementation of a simulation tool for networks of workstations. 2000, 154–159. BibTeX

@conference{6804250,
	author = "Molero, Xavier and Silla, Federico and Rodriguez, F. and Santonja, Vicente",
	abstract = "Networks of workstations (NOWs) are rapidly emerging as a cost-effective alternative to parallel computers. In order to evaluate their performance, it is necessary to use adequate tools. Performance evaluation may be based on several types of modeling techniques (analytical modeling, simulation modeling, prototyping). Each of them differs in their scope and applicability. However, the simulation modeling technique offers more freedom and flexibility than the other methods. Thus, when evaluating the performance of NOWs, simulation modeling is often used because it provides a convenient and reliable way for such studies. We have implemented a very flexible and easy to use NOW simulator. The authors present a brief description of the employed simulation language, the internal design of the implemented tool, along with fragments of the code for simulating the behavior of the modeled system elements",
	address = "San Diego, CA, USA",
	booktitle = "Proceedings of the High Performance Computing Symposium - HPC 2000",
	keywords = "discrete event simulation;performance evaluation;virtual machines;workstation clusters;",
	note = "simulation tool;networks of workstations;NOWs;modeling techniques;performance evaluation;analytical modeling;simulation modeling;prototyping;simulation language;model design;discrete simulation;hierarchical modeling;arrival generation;",
	pages = "154--159",
	title = "{D}esign and implementation of a simulation tool for networks of workstations",
	year = 2000
}

Federico Silla and Jose Duato. Is it worth the flexibility provided by irregular topologies in networks of workstations? 1999, 47–61. BibTeX

@conference{6439234,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming a cost-effective alternative for small-scale parallel computing. Usually, NOWs present an irregular topology as a consequence of the needs in a local area network. Routing algorithms used in NOWs are inherently different from those used in regular networks, mainly due to the irregular connections between switches. In these algorithms, routing is considerably restricted in order to avoid deadlocks. Recently, a general methodology for the design of adaptive routing algorithms for irregular networks has been proposed by the authors. The resulting algorithms increase the maximum achievable throughput while reducing message latency. In this paper, we study how much network performance we are losing due to the irregular topology of NOWs. We analyze the performance of the up*/down* routing algorithm in a 2D mesh topology and compare it with the performance achieved by the XY routing scheme in the same network, in order to answer the following two questions: 1) in a 2D mesh, which of the two routing algorithms achieves better performance?, and 2) where does the up*/down* routing algorithm work better, in a 2D mesh or in an irregular network? Simulation results show that the up*/down* routing strategy performs better in a regular network than in an irregular one. On the other hand, the XY routing algorithm considerably outperforms the up*/down* scheme. However, when the adaptive routing algorithm proposed by the authors is used, differences in performance are much smaller. Thus, the higher performance of a regular topology could not compensate for the loss in wiring flexibility with respect to irregular networks, or their capability of adding a single switch at any moment",
	address = "Berlin, Germany",
	booktitle = "Network-Based Parallel Computing. Communication, Architecture, and Applications. Third International Workshop, CANPC'99 Proceedings",
	keywords = "multiprocessor interconnection networks;network routing;network topology;performance evaluation;workstation clusters;",
	note = "NOWs;networks of workstations;irregular topology;local area network;routing;adaptive routing algorithms;2D mesh topology;performance;",
	pages = "47--61",
	title = "{I}s it worth the flexibility provided by irregular topologies in networks of workstations?",
	year = 1999
}

Jose Duato, Antonio Robles, Federico Silla and R Beivide. Comparison of router architectures for virtual cut-through and wormhole switching in a NOW environment. Proceedings of the International Parallel Processing Symposium, IPPS, pages 240–247, 1999. BibTeX

@article{1999394752205,
	author = "Duato, Jose and Robles, Antonio and Silla, Federico and Beivide, R.",
	abstract = "Most commercial routers designed for networks of workstations (NOWs) implement wormhole switching. However, wormhole switching is not well suited for NOWs. The long wires required in this environment lead to large buffers to prevent buffer overflow during flow control signaling. Moreover, wire length is limited by buffer size. Virtual cut-through (VCT) achieves a higher throughput than wormhole switching. Moreover, the traditional disadvantages of VCT switching, as buffer requirements and packetizing overhead, disappear in NOWs. In this paper, we show that VCT routers can be simpler than wormhole ones, while still achieving the advantages of using virtual channels and adaptive routing. We also propose a fully adaptive routing algorithm for VCT switching in NOWs. Moreover, we show that VCT routers outperform wormhole routers in a NOW environment at a lower cost.",
	address = "San Juan",
	issn = "1063-7133",
	journal = "Proceedings of the International Parallel Processing Symposium, IPPS",
	key = "Pipeline processing systems",
	keywords = "Adaptive algorithms;Computer architecture;Computer workstations;Switching networks;",
	note = "Virtual cut-through (VCT);Wormhole switching;",
	pages = "240--247",
	title = "{C}omparison of router architectures for virtual cut-through and wormhole switching in a {NOW} environment",
	year = 1999
}

Jose Duato, Antonio Robles, Federico Silla and R Beivide. A comparison of router architectures for virtual cut-through and wormhole switching in a NOW environment. 1999, 240–247. URL BibTeX

@conference{6245442,
	author = "Duato, Jose and Robles, Antonio and Silla, Federico and Beivide, R.",
	abstract = "Most commercial routers designed for networks of workstations (NOWs) implement wormhole switching. However wormhole switching is not well suited for NOWs. The long wires required in this environment lead to large buffers to prevent buffer overflow during flow control signaling. Moreover, wire length is limited by buffer size. Virtual cut-through (VCT) achieves a higher throughput than wormhole switching. Moreover, the traditional disadvantages of VCT switching, as buffer requirements and packetizing overhead, disappear in NOWs. In this paper, we show that VCT routers can be simpler than wormhole ones, while still achieving the advantages of using virtual channels and adaptive routing. We also propose a fully adaptive routing algorithm for VCT switching in NOWs. Moreover, we show that VCT routers outperform wormhole routers in a NOW environment at a lower cost",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 13th International Parallel Processing Symposium and 10th Symposium on Parallel and Distributed Processing. IPPS/SPDP 1999",
	doi = "10.1109/IPPS.1999.760469",
	keywords = "multiprocessor interconnection networks;network routing;workstation clusters;",
	note = "router architectures;virtual cut-through;wormhole switching;NOW environment;networks of workstations;buffer requirements;packetizing overhead;VCT routers;",
	pages = "240--247",
	title = "{A} comparison of router architectures for virtual cut-through and wormhole switching in a {NOW} environment",
	url = "http://dx.doi.org/10.1109/IPPS.1999.760469",
	year = 1999
}

Federico Silla, Jose Duato, A Sivasubramaniam and C R Das. Virtual channel multiplexing in networks of workstations with irregular topology. 1998, 147–154. URL BibTeX

@conference{6129280,
	author = "Silla, Federico and Duato, Jose and Sivasubramaniam, A. and Das, C. R.",
	abstract = "Networks of workstations are becoming a cost-effective alternative for small-scale parallel computing. Although they may not provide the closely coupled environment of multicomputers and multiprocessors, they meet the needs of a great variety of parallel computing problems at a lower cost. However in order to achieve a high efficiency, the interconnects used to build the network of workstations must provide a very high bandwidth and low latencies, making their design a critical issue. Recently, a very efficient flow control protocol for networks of workstations has been proposed by the authors. This protocol multiplexes physical channels between several virtual channels and minimizes the use of control flits by transmitting several data flits each time a virtual channel gets the link. In this protocol, a virtual channel sends data flits until the message blocks or is completely transmitted. However it can reduce network throughput, by increasing short message latency, due to long messages monopolizing channels and hindering the progress of short messages. In this paper, we analyze the impact of limiting the number of flits (block size) that a virtual channel can send once it gets the link. We propose a new version of the previous flow control protocol that is easily implementable on hardware. Simulation results show that limiting the maximum block size is not a good design decision, because the overall network performance decreases. Only when short message latency is crucial is it acceptable to limit the block size",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. Fifth International Conference on High Performance Computing (Cat. No. 98EX238)",
	doi = "10.1109/HIPC.1998.737983",
	keywords = "multiplexing;parallel processing;performance evaluation;protocols;workstation clusters;",
	note = "virtual channel multiplexing;workstation networks;irregular topology;small-scale parallel computing;high efficiency;interconnects;high bandwidth;low latency;flow control protocol;physical channels;minimized control flit use;data flit transmission;network throughput;simulation;network performance;short message latency;",
	pages = "147--154",
	title = "{V}irtual channel multiplexing in networks of workstations with irregular topology",
	url = "http://dx.doi.org/10.1109/HIPC.1998.737983",
	year = 1998
}

Federico Silla and Jose Duato. On the use of virtual channels in networks of workstations with irregular topology. 1998, 203–216. BibTeX

@conference{5992382,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations are becoming increasingly popular as a cost-effective alternative to parallel computers. Typically, these networks connect processors using irregular topologies, providing the wiring flexibility, scalability and incremental expansion capability required in this environment. Recently, we proposed a design methodology as well as fully adaptive routing algorithms for irregular topologies. These algorithms increase throughput considerably with respect to previously existing ones but require the use of virtual channels. In this paper we propose a very efficient flow control mechanism to support virtual channels when link wires are very long, and/or have different lengths. This flow control mechanism relies on the use of channel pipelining and control flits. Control traffic is minimized by assigning physical bandwidth to virtual channels until the corresponding message blocks or it is completely transmitted. Simulations show that the resulting flow control protocol performs almost as efficiently as an ideal network with short wires and flit-by-flit multiplexing",
	address = "Berlin, Germany",
	booktitle = "Parallel Computer Routing and Communication. Second International Workshop, PCRCW'97. Proceedings",
	keywords = "message passing;multiprocessor interconnection networks;network topology;pipeline processing;resource allocation;shared memory systems;",
	note = "virtual channels;networks of workstations;irregular topology;processor interconnection;flow control mechanism;link wires;channel pipelining;control flits;traffic minimization;physical bandwidth assignment;message transmission;simulations;adaptive routing;",
	pages = "203--216",
	title = "{O}n the use of virtual channels in networks of workstations with irregular topology",
	year = 1998
}

Federico Silla, Antonio Robles and Jose Duato. Improving performance of networks of workstations by using Disha Concurrent. In Proceedings. 1998 International Conference on Parallel Processing (Cat. No.98EX205). 1998, 80–87. URL BibTeX

@conference{6034697,
	author = "Silla, Federico and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations are currently emerging as a cost-effective alternative to parallel computers. Recently, deadlock recovery techniques have been shown to be an alternative to deadlock avoidance. Disha Concurrent is a progressive deadlock recovery scheme able to simultaneously redirect several deadlocked messages through a deadlock-free lane. Unlike deadlock avoidance techniques, Disha provides true fully adaptive routing without using virtual channels to guarantee deadlock freedom. In this paper, we analyze the application of Disha to networks of workstations. We propose an implementation of Disha on irregular networks that allows concurrent deadlock recovery, proving that this implementation is always able to recover from deadlock. A new switch organization and a new flow control protocol are proposed to support Disha. Performance evaluation results show that applying Disha to irregular networks increases network throughput by a factor of up to 3.5, and also reduces latency with regard to other routing algorithms based on deadlock avoidance techniques",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 1998 International Conference on Parallel Processing (Cat. No.98EX205)",
	doi = "10.1109/ICPP.1998.708466",
	keywords = "concurrency control;local area networks;parallel processing;performance evaluation;system recovery;workstations;",
	note = "performance improvement;networks of workstations;Disha Concurrent;deadlock recovery techniques;deadlock avoidance;flow control protocol;latency;",
	pages = "80--87",
	title = "{I}mproving performance of networks of workstations by using {D}isha {C}oncurrent",
	url = "http://dx.doi.org/10.1109/ICPP.1998.708466",
	year = 1998
}

Federico Silla, Antonio Robles and Jose Duato. Improving performance of networks of workstations by using Disha Concurrent. In TH Lai (ed.). 1998 International Conference on Parallel Processing - Proceedings. 1998, 80–87. BibTeX

@conference{ISI:000075698400010,
	author = "Silla, Federico and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations are currently emerging as a cost-effective alternative to parallel computers. Recently, deadlock recovery techniques have been shown to be an alternative to deadlock avoidance. Disha Concurrent is a progressive deadlock recovery scheme able to simultaneously redirect several deadlocked messages through a deadlock-free lane. Unlike deadlock avoidance techniques, Disha provides true fully adaptive routing without using virtual channels to guarantee deadlock freedom. In this paper, we analyze the application of Disha to networks of workstations. We propose an implementation of Disha on irregular networks that allows concurrent deadlock recovery, proving that this implementation is always able to recover from deadlock. A new switch organization and a new flow control protocol are proposed to support Disha. Performance evaluation results show that applying Disha to irregular networks increases network throughput by a factor of up to 3.5, and also reduces latency with regard to other routing algorithms based on deadlock avoidance techniques.",
	booktitle = "1998 International Conference on Parallel Processing - Proceedings",
	editor = "Lai, T. H.",
	isbn = "0818686510",
	issn = "0190-3918",
	note = "International Conference on Parallel Processing (ICPP), Minneapolis, MN, August 10--14, 1998",
	pages = "80--87",
	series = "Proceedings of the International Conference on Parallel Processing",
	title = "{I}mproving performance of networks of workstations by using {D}isha {C}oncurrent",
	year = 1998
}

Federico Silla, M P Malumbres, Jose Duato, D Dai and D K Panda. Impact of adaptivity on the behavior of networks of workstations under bursty traffic. In Proceedings. 1998 International Conference on Parallel Processing (Cat. No.98EX205). 1998, 88–95. URL BibTeX

@conference{6034698,
	author = "Silla, Federico and Malumbres, M. P. and Duato, Jose and Dai, D. and Panda, D. K.",
	abstract = "Networks of workstations (NOWs) are becoming increasingly popular as an alternative to parallel computers. Typically, these networks present irregular topologies, providing the wiring flexibility, scalability, and incremental expansion capability required in this environment. Similar to the evolution of parallel computers, NOWs are also evolving from distributed memory to shared memory. However distances between processors are longer in NOWs, leading to higher message latency and lower network bandwidth. Therefore, one can expect the network to be a bottleneck when executing some parallel applications on a NOW supporting a shared-memory programming paradigm. The authors analyze whether the interconnection network in a NOW is able to efficiently handle the traffic generated in a DSM with the same number of processors. They evaluate the behavior of a NOW using application traces captured during the execution of several SPLASH2 applications on a DSM simulator. They show through simulation that the adaptive routing algorithm previously proposed by them almost eliminates network saturation due to its ability to support a higher sustained throughput. Therefore, adaptive routing becomes a key design issue to achieve similar performance in NOWs and tightly-coupled DSMs",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 1998 International Conference on Parallel Processing (Cat. No.98EX205)",
	doi = "10.1109/ICPP.1998.708467",
	keywords = "distributed memory systems;local area networks;parallel processing;shared memory systems;telecommunication network routing;telecommunication traffic;virtual machines;workstations;",
	note = "workstation network behaviour;bursty traffic;adaptivity;irregular topologies;wiring flexibility;wiring scalability;incremental expansion capability;distributed memory;shared memory;message latency;network bandwidth;parallel applications;shared-memory programming paradigm;interconnection network;traffic handling;application traces;SPLASH2 applications;simulator;adaptive routing algorithm;network saturation;",
	pages = "88--95",
	title = "{I}mpact of adaptivity on the behavior of networks of workstations under bursty traffic",
	url = "http://dx.doi.org/10.1109/ICPP.1998.708467",
	year = 1998
}

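Federico Silla and Jose Duato. Tuning the number of virtual channels in networks of workstations. In Proceedings of the ISCA 10th International Conference on Parallel and Distributed Computing Systems. 1997, 72–75. BibTeX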

@conference{5870025,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. Typically, these networks connect processors using switch-based interconnects with irregular topology. We proposed a design methodology as well as fully adaptive routing algorithms for irregular topologies. These algorithms require the use of, at least, two virtual channels. We have also proposed a very efficient flow control mechanism to support virtual channels in the environment of irregular networks with varying wire lengths. We study the effect that additional virtual channels have on the performance of irregular networks built using the routing algorithms and the flow control mechanism. Results reveal that the optimal number of virtual channels per physical channel varies with network size",
	address = "Raleigh, NC, USA",
	booktitle = "Proceedings of the ISCA 10th International Conference on Parallel and Distributed Computing Systems",
	keywords = "local area networks;multiprocessor interconnection networks;performance evaluation;telecommunication channels;telecommunication network routing;",
	note = "virtual channel tuning;workstation networks;cost effective;parallel computers;processor interconnection networks;switch based interconnects;irregular topology;design methodology;adaptive routing algorithms;flow control;varying wire length;network performance;routing algorithms;network size;wormhole switching;",
	pages = "72--75",
	title = "{T}uning the number of virtual channels in networks of workstations",
	year = 1997
}

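Federico Silla and Jose Duato. Improving the efficiency of adaptive routing in networks with irregular topology. In Proceedings. Fourth International Conference on High-Performance Computing (Cat. No.97TB100185). 1997, 330–335. URL BibTeX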

@conference{5767661,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations are emerging as a cost-effective alternative to parallel computers. The interconnection between workstations usually relies on switch-based networks with irregular topologies. This irregularity makes routing and deadlock avoidance quite complicated. Current proposals avoid deadlock by removing cyclic dependencies between channels and therefore, many messages are routed along non-minimal paths, increasing latency and wasting resources. We propose a general methodology for the design of adaptive routing algorithms for networks with irregular topology that improves a previously proposed one by reducing the probability of routing over non-minimal paths. The resulting routing algorithms allow messages to follow minimal paths in most cases, reducing message latency and increasing network throughput. As an example of application, we propose an improved adaptive routing algorithm for Autonet",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. Fourth International Conference on High-Performance Computing (Cat. No.97TB100185)",
	doi = "10.1109/HIPC.1997.634511",
	keywords = "concurrency control;graph theory;local area networks;message switching;parallel processing;performance evaluation;telecommunication network routing;",
	note = "adaptive routing;irregular topology networks;workstation networks;cost-effective;parallel computers;switch-based networks;deadlock avoidance;cyclic dependencies;message routing;latency;probability;minimal paths;message latency;network throughput;Autonet;local area networks;",
	pages = "330--335",
	title = "{I}mproving the efficiency of adaptive routing in networks with irregular topology",
	url = "http://dx.doi.org/10.1109/HIPC.1997.634511",
	year = 1997
}

Federico Silla and Jose Duato. Improving the efficiency of adaptive routing in networks with irregular topology. In Proceedings of the International Conference on High Performance Computing, HiPC. 1997, 330–335. BibTeX

@conference{1998104020145,
	author = "Silla, Federico and Duato, Jose",
	abstract = "Networks of workstations are emerging as a cost-effective alternative to parallel computers. The interconnection between workstations usually relies on switch-based networks with irregular topologies. This irregularity makes routing and deadlock avoidance quite complicated. Current proposals avoid deadlock by removing cyclic dependencies between channels and therefore, many messages are routed along non-minimal paths, increasing latency and wasting resources. In this paper, we propose a general methodology for the design of adaptive routing algorithms for networks with irregular topology that improves over a previously proposed one by reducing the probability of routing over non-minimal paths. The resulting routing algorithms allow messages to follow minimal paths in most cases, reducing message latency and increasing network throughput. As an example of application, we propose an improved adaptive routing algorithm for Autonet.",
	address = "Bangalore, India",
	booktitle = "Proceedings of the International Conference on High Performance Computing, HiPC",
	key = "Computer networks",
	keywords = "Adaptive algorithms;Communication channels (information theory);Computer system recovery;Computer workstations;Congestion control (communication);Electric network topology;Probability;Response time (computer systems);Switching circuits;",
	note = "Adaptive routing algorithms;",
	pages = "330--335",
	title = "{I}mproving the efficiency of adaptive routing in networks with irregular topology",
	year = 1997
}

Jose Duato, Pedro Lopez, Federico Silla and S Yalamanchili. A high performance router architecture for interconnection networks. In Proceedings of the 1996 International Conference on Parallel Processing. Vol.1 Architecture. 1996, 61–68. URL BibTeX

@conference{5376067,
	author = "Duato, Jose and Lopez, Pedro and Silla, Federico and Yalamanchili, S.",
	abstract = "We propose a new router architecture that supports wormhole switching and circuit switching concurrently. This architecture has been designed to take advantage of temporal communication locality. This can be done by establishing a circuit between nodes that are going to communicate frequently. Messages using those circuits face no contention. By combining circuit switching, pre-established physical circuits and wave pipelining across channels and switches, it is possible to increase network bandwidth considerably, also reducing latency for communications that use pre-established physical circuits. This router architecture also allows to reduce the overhead of the software messaging layer in multicomputers by offering a better hardware support. Preliminary performance evaluation results show a drastic reduction in latency and increment in throughput when messages are long enough, even if circuits are established for a single transmission and locality is not exploited",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings of the 1996 International Conference on Parallel Processing. Vol.1 Architecture",
	doi = "10.1109/ICPP.1996.537144",
	keywords = "message passing;multiprocessor interconnection networks;parallel architectures;performance evaluation;",
	note = "high performance router architecture;interconnection networks;wormhole switching;circuit switching;temporal communication locality;router architecture;software messaging layer;performance evaluation;",
	pages = "61--68",
	title = "{A} high performance router architecture for interconnection networks",
	url = "http://dx.doi.org/10.1109/ICPP.1996.537144",
	volume = "1",
	year = 1996
}

Thesis

Addressing Manufacturing Challenges in NoC-based ULSI Designs. Jose Duato, Federico Silla (Network-On-Chip)

Floorplan-Aware High Performance NoC Design. Jose Flich, Federico Silla (Network-On-Chip)

Routing and flow control in networks of workstations. Jose Duato (Networks of Workstations)

On the Enhancement of Remote GPU Virtualization in High Performance Clusters. Jose Duato, Federico Silla (High Performance Clusters)