1. Josué Feliu, Salvador Petit, Julio Sahuquillo and Jose Duato. Cache-hierarchy Contention Aware Scheduling in CMPs. IEEE Transactions on Parallel and Distributed Systems 25(3):581 - 590, March 2014. DOI BibTeX

    @article{ dblp:journals/tpds/josue2013,
    	author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
    	doi = "10.1109/TPDS.2013.61",
    	journal = "IEEE Transactions on Parallel and Distributed Systems",
    	month = "March",
    	number = 3,
    	pages = "581 - 590",
    	title = "{C}ache-hierarchy {C}ontention {A}ware {S}cheduling in {CMP}s",
    	volume = 25,
    	year = 2014
    }
    
  2. José Luis March, Salvador Petit, Julio Sahuquillo and Houcine Hassan Mohamed. Dynamic WCET Estimation for Real-Time Multicore Embedded Systems Supporting DVFS. In 2014 IEEE International Conference on High Performance Computing and Communications (HPCC). 2014. BibTeX

    @conference{ 10.1109/hpcc.2014.11,
    	author = "March, Jos{\'e} Luis and Petit, Salvador and Sahuquillo, Julio and Mohamed, Houcine Hassan",
    	booktitle = "2014 IEEE International Conference on High Performance Computing and Communications (HPCC)",
    	title = "{D}ynamic {WCET} {E}stimation for {R}eal-{T}ime {M}ulticore {E}mbedded {S}ystems {S}upporting {DVFS}",
    	year = 2014
    }
    
  3. Salvador Petit, Rafael Ubal, Julio Sahuquillo and Pedro Lopez. Efficient Register Renaming and Recovery for High-Performance Processors. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 22(7):1506-1514, 2014. BibTeX

    @article{ 10.1109/tvlsi.2013.2270001,
    	author = "Petit, Salvador and Ubal, Rafael and Sahuquillo, Julio and Lopez, Pedro",
    	abstract = "Modern superscalar processors implement register renaming using either random access memory (RAM) or content-addressable memories (CAM) tables. The design of these structures should address both access time and misprediction recovery penalty. Although direct-mapped RAMs provide faster access times, CAMs are more appropriate to avoid recovery penalties. The presence of associative ports in CAMs, however, prevents them from scaling with the number of physical registers and pipeline width, negatively impacting performance, area, and energy consumption at the rename stage. In this paper, we present a new hybrid RAM–CAM register renaming scheme, which combines the best of both approaches. In a steady state, a RAM provides fast and energy-efficient access to register mappings. On misspeculation, a low-complexity CAM enables immediate recovery. Experimental results show that in a four-way state-of-the-art superscalar processor, the new approach provides almost the same performance as an ideal CAM-based renaming scheme, while dissipating only between 17% and 26% of the original energy and, in some cases, consuming less energy than purely RAM-based renaming schemes. Overall, the silicon area required to implement the hybrid RAM–CAM scheme does not exceed the area required by conventional renaming mechanisms.",
    	journal = "IEEE Transactions on Very Large Scale Integration (VLSI) Systems",
    	number = 7,
    	pages = "1506-1514",
    	title = "{E}fficient {R}egister {R}enaming and {R}ecovery for {H}igh-{P}erformance {P}rocessors",
    	volume = 22,
    	year = 2014
    }
    
  4. Sergio Iserte, Adrián Castelló Gimeno, Rafael Mayo, Enrique S Quintana-Ortí, Federico Silla, Jose Duato, Carlos Reaño and Javier Prades. SLURM Support for Remote GPU Virtualization: Implementation and Performance Study. In 26th IEEE International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD 2014, Paris, France, October 22-24, 2014. 2014, 318–325. URL, DOI BibTeX

    @conference{ dblp:conf/sbac-pad/isertegmqsdrp14,
    	author = "Sergio Iserte and Adri{\'a}n Castell{\'o} Gimeno and Rafael Mayo and Enrique S. Quintana-Ort{\'i} and Silla, Federico and Duato, Jose and Rea{\~n}o, Carlos and Prades, Javier",
    	booktitle = "26th IEEE International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD 2014, Paris, France, October 22-24, 2014",
    	crossref = "DBLP:conf/sbac-pad/2014",
    	doi = "10.1109/SBAC-PAD.2014.49",
    	pages = "318--325",
    	title = "{SLURM} {S}upport for {R}emote {GPU} {V}irtualization: {I}mplementation and {P}erformance {S}tudy",
    	url = "http://dx.doi.org/10.1109/SBAC-PAD.2014.49",
    	year = 2014
    }
    
  5. Roberto Peñaranda, Crispín Gómez Requena, María Engracia Gómez, Pedro López and José Duato. Deterministic Routing with HoL-Blocking-Awareness for Direct Topologies. In ICCS. 2013, 2521-2524. BibTeX

    @conference{ penaranda:iccs13,
    	author = "Roberto Pe naranda and Crisp{\'i}n G{\'o}mez Requena and Mar{\'i}a Engracia G{\'o}mez and Pedro L{\'o}pez and Jos{\'e} Duato",
    	booktitle = "ICCS",
    	pages = "2521-2524",
    	title = "{D}eterministic {R}outing with {H}o{L}-{B}locking-{A}wareness for {D}irect {T}opologies",
    	year = 2013
    }
    
  6. Roberto Peñaranda, Crispín Gómez Requena, María Engracia Gómez, Pedro López and José Duato. Deterministic Routing with HoL-Blocking-Awareness for Direct Topologies. In ICCS. 2013, 2521-2524. BibTeX

    @conference{ dblp:conf/iccs/penarandaggld13,
    	author = "Roberto Pe naranda and Crisp{\'i}n G{\'o}mez Requena and Mar{\'i}a Engracia G{\'o}mez and Pedro L{\'o}pez and Jos{\'e} Duato",
    	booktitle = "ICCS",
    	crossref = "DBLP:conf/iccS/2013",
    	pages = "2521-2524",
    	title = "{D}eterministic {R}outing with {H}o{L}-{B}locking-{A}wareness for {D}irect {T}opologies",
    	year = 2013
    }
    
  7. Carlos Reaño, Antonio José Peña, Federico Silla, Rafa Mayo, Enrique S Quintana-Ortí and Jose Duato. Influence of InfiniBand FDR on the Performance of Remote GPU Virtualization. In International Conference on Cluster Computing (Cluster). 2013. BibTeX

    @conference{ reanoinfluence,
    	author = "Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Rafa Mayo and Enrique S. Quintana-Ort{\'i} and Duato, Jose",
    	booktitle = "International Conference on Cluster Computing (Cluster)",
    	title = "{I}nfluence of {I}nfini{B}and {FDR} on the {P}erformance of {R}emote {GPU} {V}irtualization",
    	year = 2013
    }
    
  8. Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. L1-Bandwidth Aware Thread Allocation in Multicore SMT Processors. In 22nd International Conference on Parallel Architectures and Compilation Techniques, PACT'13, Edinburgh, United Kingdom, Sep 7-11. 2013, 123-132. BibTeX

    @conference{ pact/feliu/13,
    	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
    	booktitle = "22nd International Conference on Parallel Architectures and Compilation Techniques, PACT'13, Edinburgh, United Kingdom, Sep 7-11",
    	isbn = "978-1-4799-1021-2",
    	pages = "123-132",
    	title = "{L}1-{B}andwidth {A}ware {T}hread {A}llocation in {M}ulticore {SMT} {P}rocessors",
    	year = 2013
    }
    
  9. Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Planificación Considerando Degradación de Prestaciones por Contención. In XXIV Jornadas de Paralelismo, JP 2013, Madrid, Sep 17-20. 2013, 62-67. BibTeX

    @conference{ jp/feliu/13,
    	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
    	booktitle = "XXIV Jornadas de Paralelismo, JP 2013, Madrid, Sep 17-20",
    	isbn = "978-84-695-8330-2",
    	pages = "62-67",
    	title = "{P}lanificaci{\'o}n {C}onsiderando {D}egradaci{\'o}n de {P}restaciones por {C}ontenci{\'o}n",
    	year = 2013
    }
    
  10. Vassil N Alexandrov, Michael Lees, Valeria V Krzhizhanovskaya, Jack Dongarra and Peter M A Sloot (eds.). Proceedings of the International Conference on Computational Science, ICCS 2013, Barcelona, Spain, 5-7 June, 2013. Procedia Computer Science 18, Elsevier, 2013. BibTeX

    @proceedings{ dblp:conf/iccs/2013,
    	author = "",
    	booktitle = "ICCS",
    	editor = "Vassil N. Alexandrov and Michael Lees and Valeria V. Krzhizhanovskaya and Jack Dongarra and Peter M. A. Sloot",
    	publisher = "Elsevier",
    	series = "Procedia Computer Science",
    	title = "{P}roceedings of the {I}nternational {C}onference on {C}omputational {S}cience, {ICCS} 2013, {B}arcelona, {S}pain, 5-7 {J}une, 2013",
    	volume = 18,
    	year = 2013
    }
    
  11. Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Using huge pages and performance counters to determine the LLC architecture. In International Conference on Computational Science, ICCS'13, Barcelona, Jun 5-7. 2013, 2557-2560. BibTeX

    @conference{ josue_iccs_2013,
    	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
    	booktitle = "International Conference on Computational Science, ICCS'13, Barcelona, Jun 5-7",
    	pages = "2557-2560",
    	title = "{U}sing huge pages and performance counters to determine the {LLC} architecture",
    	year = 2013
    }
    
  12. Roberto Peñaranda, Crispín Gomez, Maria E Gomez and Pedro Lopez. A New Family of Hybrid Topologies for Large-Scale Interconnection Networks. In 2012 11th IEEE International Symposium on Network Computing and Applications (NCA). 2012. BibTeX

    @conference{ 10.1109/nca.2012.22,
    	author = "Pe{\~n}aranda, Roberto and Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro",
    	abstract = "In large supercomputers the topology of the interconnection network is a key design issue that impacts the performance and cost of the whole system. Direct topologies provide a reduced hardware cost, but as the number of dimensions is conditioned by 3D wiring restrictions, a high number of nodes per dimension is used, which increases communication latency and reduces network throughput. On the other hand, indirect topologies can provide better performance for large network sizes, but at the cost of a high amount of switches and links. In this paper we propose a new family of topologies that combines the best features of both direct and indirect topologies to efficiently connect an extremely high number of nodes. In particular, we propose an n-dimensional topology where the nodes of each dimension are connected through a small indirect topology. This combination results in a family of topologies that provides high performance, with latency and throughput figures of merit close to indirect topologies, but with a lower hardware cost. In particular, it is able to double the throughput obtained per switching element of indirect topologies. Moreover, the layout of the topology is much simpler than in indirect topologies. Indeed, its fault-tolerance degree is equal or higher than the one for direct and indirect topologies.",
    	journal = "Network Computing and Applications (NCA), 2012 11th IEEE International Symposium on",
    	title = "{A} {N}ew {F}amily of {H}ybrid {T}opologies for {L}arge-{S}cale {I}nterconnection {N}etworks",
    	year = 2012
    }
    
  13. Roberto Peñaranda, Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. A New Family of Hybrid Topologies for Large-Scale Interconnection Networks. In IEEE 11th International Symposium on Network Computing and Applications. August 2012, 220-227. BibTeX

    @conference{ hybridtopology,
    	author = "Pe{\~n}aranda, Roberto and Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
    	abstract = "In large supercomputers the topology of the interconnection network is a key design issue that impacts the performance and cost of the whole system. Direct topologies provide a reduced hardware cost, but, as the number of dimensions is conditioned by 3D wiring restrictions, a high number of nodes per dimension is used, which increases communication latency and reduces network throughput. On the other hand, indirect topologies can provide better performance for large network sizes, but at the cost of a high number of switches and links. In this paper, we propose a new family of topologies that combines the best features of both direct and indirect topologies to efficiently connect an extremely high number of nodes. In particular, we propose an n–dimensional topology, where the nodes of each dimension are connected through a small indirect topology. This combination results in a family of topologies that provides high performance, with latency and throughput figures of merit close to indirect topologies, but with a lower hardware cost. In particular, it is able to double the throughput obtained per switching element of indirect topologies. Moreover, the layout of the topology is much simpler than in indirect topologies. Indeed, its fault–tolerance degree is equal or higher than the one for direct and indirect topologies.",
    	journal = "IEEE 11th International Symposium on Network Computing and Applications",
    	keywords = "routing algorithm, direct topology, indirect topology",
    	month = "August",
    	pages = "220-227",
    	title = "{A} {N}ew {F}amily of {H}ybrid {T}opologies for {L}arge-{S}cale {I}nterconnection {N}etworks",
    	year = 2012
    }
    
  14. Carlos Reaño, Federico Silla and Germán Vidal. CU2rCU: A CUDA-to-rCUDA Converter. Master's thesis, Universitat Politècnica de València, Spain, 2012. URL BibTeX

    @mastersthesis{ cu2rcu_master,
    	author = "Rea{\~n}o, Carlos and Silla, Federico and Germ{\'a}n Vidal",
    	address = "Spain",
    	school = "Universitat Polit{\`e}cnica de Val{\`e}ncia",
    	title = "{CU}2r{CU}: {A} {CUDA}-to-r{CUDA} {C}onverter",
    	url = "http://hdl.handle.net/10251/27435",
    	year = 2012
    }
    
  15. Carlos Reaño, Antonio José Peña, Federico Silla, R Mayo, E S Quintana-Ortí and Jose Duato. CU2rCU: towards the Complete rCUDA Remote GPU Virtualization and Sharing Solution. In 19th Annual International Conference on High Performance Computing (HiPC 2012). 2012. BibTeX

    @conference{ cu2rcu_hipc2012,
    	author = "Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, R. and Quintana-Ort{\'i}, E. S. and Duato, Jose",
    	booktitle = "19th Annual International Conference on High Performance Computing (HiPC 2012)",
    	title = "{CU}2r{CU}: towards the {C}omplete r{CUDA} {R}emote {GPU} {V}irtualization and {S}haring {S}olution",
    	year = 2012
    }
    
  16. Carlos Reaño, Antonio José Peña, Federico Silla, R Mayo, E S Quintana-Ortí and Jose Duato. CU2rCU: towards the Complete rCUDA Remote GPU Virtualization and Sharing Solution. In 19th Annual International Conference on High Performance Computing (HiPC). December 2012. URL BibTeX

    @conference{ cu2rcu_hipc12,
    	author = "Rea{\~n}o, Carlos and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Mayo, R. and Quintana-Ort{\'i}, E. S. and Duato, Jose",
    	booktitle = "19th Annual International Conference on High Performance Computing (HiPC)",
    	month = "December",
    	title = "{CU}2r{CU}: towards the {C}omplete r{CUDA} {R}emote {GPU} {V}irtualization and {S}haring {S}olution",
    	url = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp={\&}arnumber=6507485{\&}isnumber=6507469",
    	year = 2012
    }
    
  17. Roberto Peñaranda, Crispín Gómez Requena, María Engracia Gómez, Pedro López and José Duato. IODET: A HoL-blocking-aware Deterministic Routing Algorithm for Direct Topologies. In ICPADS. 2012, 702-703. BibTeX

    @conference{ dblp:conf/icpads/penarandaggld12,
    	author = "Roberto Pe naranda and Crisp{\'i}n G{\'o}mez Requena and Mar{\'i}a Engracia G{\'o}mez and Pedro L{\'o}pez and Jos{\'e} Duato",
    	booktitle = "ICPADS",
    	pages = "702-703",
    	title = "{IODET}: {A} {H}o{L}-blocking-aware {D}eterministic {R}outing {A}lgorithm for {D}irect {T}opologies",
    	year = 2012
    }
    
  18. Carles Hernández, Antoni Roca, Federico Silla, Jose Flich and Jose Duato. On the Impact of Within-Die Process Variation in GALS-Based NoC Performance. IEEE Trans. on CAD of Integrated Circuits and Systems 31(2):294-307, 2012. BibTeX

    @article{ dblp:journals/tcad/hernandezrsfd12,
    	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Silla, Federico and Flich, Jose and Duato, Jose",
    	journal = "IEEE Trans. on CAD of Integrated Circuits and Systems",
    	number = 2,
    	pages = "294-307",
    	title = "{O}n the {I}mpact of {W}ithin-{D}ie {P}rocess {V}ariation in {GALS}-{B}ased {N}o{C} {P}erformance",
    	volume = 31,
    	year = 2012
    }
    
  19. Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Planificación considerando el ancho de banda de la jerarquía de cache. In XXIII Jornadas de Paralelismo, JP 2012, Elche, Sep 19-21. 2012, 472-477. BibTeX

    @conference{ jp/feliu/12,
    	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
    	booktitle = "XIII Jornadas de Paralelismo, JP 2012, Elche, Sep 19-21",
    	isbn = "978-84-695-4473-0",
    	pages = "472-477",
    	title = "{P}lanificaci{\'o} considerando el ancho de banda de la jerarqu{\'i}a de cache",
    	year = 2012
    }
    
  20. Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Understanding Cache Hierarchy Contention in CMPs to Improve Job Scheduling. In 26th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2012, Shanghai, China, May 21-25. 2012, 508-519. BibTeX

    @conference{ dblp:conf/ipps/feliuspd12,
    	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
    	booktitle = "26th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2012, Shanghai, China, May 21-25",
    	isbn = "978-1-4673-0975-2",
    	pages = "508-519",
    	title = "{U}nderstanding {C}ache {H}ierarchy {C}ontention in {CMP}s to {I}mprove {J}ob {S}cheduling",
    	year = 2012
    }
    
  21. Monica Serrano, Julio Sahuquillo, Salvador Petit, Houcine Hassan and Jose Duato. A cost-effective heuristic to schedule local and remote memory in cluster computers. Journal of Supercomputing, pages 1 - 19, 2011. URL BibTeX

    @article{ ip51286180,
    	author = "Serrano, Monica and Sahuquillo, Julio and Petit, Salvador and Houcine Hassan and Duato, Jose",
    	abstract = "Cluster computers represent a cost-effective alternative solution to supercomputers. In these systems, it is common to constrain the memory address space of a given processor to the local motherboard. Constraining the system in this way is much cheaper than using a full-fledged shared memory implementation among motherboards. However, memory usage among motherboards can be unfairly balanced. On the other hand, remote memory access (RMA) hardware provides fast interconnects among the motherboards of a cluster. RMA devices can be used to access remote RAM memory from a local motherboard. This work focuses on this capability in order to achieve a better global use of the total RAM memory in the system. More precisely, the address space of local applications is extended to remote motherboards and is used to access remote RAM memory. This paper presents an ideal memory scheduling algorithm and proposes a cost-effective heuristic to allocate local and remote memory among local applications. Compared to the devised ideal algorithm, the heuristic obtains the same (or closely resembling) results while largely reducing the computational cost. In addition, we analyze the impact on the performance of stand alone applications varying the memory distribution among regions (local, local to board, and remote). Then, this study is extended to any number of concurrent applications. Experimental results show that a QoS parameter is needed in order to avoid unacceptable performance degradation. {\&}copy; 2011 Springer Science+Business Media, LLC.",
    	issn = 09208542,
    	journal = "Journal of Supercomputing",
    	key = "Multitasking",
    	keywords = "Cost effectiveness;Costs;Printed circuits;Random access storage;Scheduling algorithms;Supercomputers;",
    	note = "Address space;Cluster computer;Computational costs;Global use;Memory address space;Memory usage;Performance degradation;QoS parameters;Remote memory;Remote memory access;Shared memories;Standalone applications;Work Focus;",
    	pages = "1 - 19",
    	title = "{A} cost-effective heuristic to schedule local and remote memory in cluster computers",
    	url = "http://dx.doi.org/10.1007/s11227-011-0566-8",
    	year = 2011
    }
    
  22. Antoni Roca, Carles Hernández, Jose Flich, Federico Silla and Jose Duato. A Distributed Switch Architecture for On-Chip Networks. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 21-30. DOI BibTeX

    @conference{ 6047169,
    	author = "Roca, Antoni and Hern{\'a}ndez, Carles and Flich, Jose and Silla, Federico and Duato, Jose",
    	abstract = "It is well-known that current Chip Multiprocessor (CMP) and high-end MultiProcessor System-on-Chip (MPSoC) designs are growing in their number of components. Networks-on-Chip (NoC) provide the required connectivity for such CMP and MPSoC designs at reasonable costs. However, as technology advances, links become the critical component in the NoC. First, because the power consumption of the link is extremely high with respect the power consumption of the rest of components (mainly switches), becoming unacceptable for long global interconnects. Second, the delay of a link does not scale with technology, thus, degrading the performance of the network. To solve both problems, several solutions have been previously proposed. In this paper, we present a new switch architecture that reduces the negative impact of links on the NoC. We call our proposal distributed switch. The distributed switch moves the circuitry of a standard switch onto the links. Then, packets are buffered, routed, and forwarded at the same time they are crossing the link. Distributing a standard switch onto the link improves the trade off between the power consumption and the operating frequency of the entire network. In contrast, area requirements are increased. The distributed switch reduces up to 14.8 #x025; the peak power consumption while increases its area up to 22 #x025;. Furthermore, the distributed switch is able to increase the maximum achievable frequency with respect to the standard switch. In particular, the maximum operating frequency of the distributed switch can be increased up to 14.3 #x025;.",
    	booktitle = "Parallel Processing (ICPP), 2011 International Conference on",
    	doi = "10.1109/ICPP.2011.28",
    	issn = "0190-3918",
    	month = "sept.",
    	pages = "21 -30",
    	title = "{A} {D}istributed {S}witch {A}rchitecture for {O}n-{C}hip {N}etworks",
    	year = 2011
    }
    
  23. Héctor Montaner, Federico Silla, Holger Froning and Jose Duato. A new degree of freedom for memory allocation in clusters. Cluster Computing, pages 1 - 23, 2011. URL BibTeX

    @article{ ip51265029,
    	author = "Montaner, H{\'e}ctor and Silla, Federico and Holger Froning and Duato, Jose",
    	abstract = "Improvements in parallel computing hardware usually involve increments in the number of available resources for a given application such as the number of computing cores and the amount of memory. In the case of shared-memory computers, the increase in computing resources and available memory is usually constrained by the coherency protocol, whose overhead rises with system size, limiting the scalability of the final system. In this paper we propose an efficient and cost-effective way to increase the memory available for a given application by leveraging free memory in other computers in the cluster. Our proposal is based on the observation that many applications benefit from having more memory resources but do not require more computing cores, thus reducing the requirements for cache coherency and allowing a simpler implementation and better scalability. Simulation results show that, when additional mechanisms intended to hide remote memory latency are used, execution time of applications that use our proposal is similar to the time required to execute them in a computer populated with enough local memory, thus validating the feasibility of our proposal. We are currently building a prototype that implements our ideas. The first results from real executions in this prototype demonstrate not only that our proposal works but also that it can efficiently execute applications that make use of remote memory resources. {\&}copy; 2011 Springer Science+Business Media, LLC.",
    	issn = 13867857,
    	journal = "Cluster Computing",
    	key = "Computer simulation",
    	keywords = "Parallel architectures;Scalability;",
    	note = "Cache coherency;Computing resource;Degree of freedom;Execution time;Free memory;Local memories;Memory allocation;Memory resources;Parallel Computing;Remote memory;Shared-memory computers;Simulation result;System size;",
    	pages = "1 - 23",
    	title = "{A} new degree of freedom for memory allocation in clusters",
    	url = "http://dx.doi.org/10.1007/s10586-010-0150-7",
    	year = 2011
    }
    
  24. J Camacho, Jose Flich, Jose Duato, H Eberle and W Olesinski. A power-efficient network on-chip topology. In Proceedings of the Fifth International Workshop on Interconnection Network Architecture: On-Chip, Multi-Chip. 2011, 23–26. URL, DOI BibTeX

    @conference{ camacho:2011:pno:1930037.1930044,
    	author = ", and Flich, Jose and Duato, Jose and H. Eberle and W. Olesinski",
    	abstract = "NoCs have become a critical component in many-core architectures. Usually, the preferred topology is the 2D-Mesh as it enables a tile-based layout significantly reducing the design effort. However, new emerging challenges such as power consumption need to be addressed. Looking at the NoC, routers and links not being used must be switched off, thus achieving large power savings. Topology and routing algorithm must be carefully designed as they may lack enough flexibility to switch off components for long periods of time. We present the NR-Mesh (Nearest neighboR Mesh) topology. It gives an end node the choice to inject a message through different neighboring routers, thereby reducing hop count and saving latency. At the receiver side, a message may be delivered to the end node through different routers, thus reducing hop count further and increasing flexibility. When allowing links and routers to switch off and combined with adaptive routing, the power management technique is able to achieve significant power savings (up to 36% savings in static power consumed at routers). When compared with the 2D-Mesh, NR-Mesh reduces execution time by 23% and power consumption at routers by 47%.",
    	address = "New York, NY, USA",
    	booktitle = "Proceedings of the Fifth International Workshop on Interconnection Network Architecture: On-Chip, Multi-Chip",
    	doi = "http://doi.acm.org/10.1145/1930037.1930044",
    	isbn = "978-1-4503-0272-2",
    	keywords = "Network-on-Chip; Power Efficient Chip Technology; Chip Topology; Routing Algorithms;",
    	pages = "23--26",
    	publisher = "ACM",
    	series = "INA-OCMC '11",
    	title = "{A} power-efficient network on-chip topology",
    	url = "http://doi.acm.org/10.1145/1930037.1930044",
    	year = 2011
    }
    
  25. Laura Bonel, Juan C Vidal, Patricia Duato and Juan R Castillo. An electrochemical competitive biosensor for ochratoxin A based on a DNA biotinylated aptamer. Biosensors and Bioelectronics 26(7):3254 - 3259, 2011. URL BibTeX

    @article{ 20110813685655,
    	author = "Laura Bonel and Juan C. Vidal and Patricia Duato and Juan R. Castillo",
    	abstract = "Ochratoxin A (OTA) is one of the most important mycotoxin contaminants of foods, particularly cereals and cereal products, with strict low regulatory levels (of ppb) in many countries worldwide. An electrochemical competitive aptamer-based biosensor for OTA is described. Paramagnetic microparticle beads (MBs) were functionalized with an aptamer specific to OTA, and were allowed to compete with a solution of the mycotoxin conjugated to the enzyme horseradish peroxidase (OTA-HRP) and free OTA. After separation and washing steps helped with magnetic separations, the modified MBs were localized on disposable screen-printed carbon electrodes (SPCEs) under a magnetic field, and the product of the enzymatic reaction with the substrate was detected with differential-pulse voltammetry. In addition to magnetic separation assays, other competitive schemes (direct/indirect aptasensors performed on the SPCEs surface or using gold nanoparticles functionalized with the aptamer) were preliminary tested, optimized and compared. The magnetic aptasensor showed a linear response to OTA in the range 0.78-8.74ngmL-1 and a limit of detection of 0.07{\&}plusmn;0.01ngmL-1, and was accurately applied to extracts of certified and spiked wheat samples with an RSD lower than about 8%. {\&}copy; 2010 Elsevier B.V.",
    	address = "Langford Lane, Kidlington, Oxford, OX5 1GB, United Kingdom",
    	issn = 09565663,
    	journal = "Biosensors and Bioelectronics",
    	key = "Biosensors",
    	keywords = "Cereal products;Electrochemistry;Electrodes;Magnetic fields;Magnetic separation;Nanomagnetics;Paramagnetism;",
    	note = "Aptamers;Electrochemical biosensor;Mycotoxins;Ochratoxin A;Screen printed;",
    	number = 7,
    	pages = "3254 - 3259",
    	title = "{A}n electrochemical competitive biosensor for ochratoxin {A} based on a {DNA} biotinylated aptamer",
    	url = "http://dx.doi.org/10.1016/j.bios.2010.12.036",
    	volume = 26,
    	year = 2011
    }
    
  26. Carles Hernández, Antoni Roca, Jose Flich, Federico Silla and Jose Duato. Characterizing the impact of process variation on 45 nm NoC-based CMPs. Journal of Parallel and Distributed Computing 71(5):651 - 663, 2011. URL, DOI BibTeX

    @article{ 20111413888254,
    	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
    	abstract = "Current integration scales make possible to design chip multiprocessors with a large amount of cores interconnected by a NoC. Unfortunately, they also bring process variation, posing a new burden to processor manufacturers. Regarding the NoC, variability causes that the delays of links and routers do not match those initially established at design time. In this paper we analyze how variability affects the NoC by applying a new variability model to 100 instances of an 8 × 8 mesh NoC synthesized using 45 nm technology. We also show that GALS-based NoCs present communication bottlenecks due to the slower components of the network, which cause congestion, thus reducing performance. This performance reduction finally affects the applications being executed in the CMP because they may be mapped to slower areas of the chip. In this paper we show that using a mapping algorithm that considers variability data may improve application execution time up to 50%. © 2010 Elsevier Inc. All rights reserved.",
    	address = "6277 Sea Harbor Drive, Orlando, FL 32887-4900, United States",
    	doi = "10.1016/j.jpdc.2010.09.006",
    	issn = "0743-7315",
    	journal = "Journal of Parallel and Distributed Computing",
    	key = "Routers",
    	keywords = "Conformal mapping;Design;Microprocessor chips;Multiprocessing systems;Servers;Systems analysis;VLSI circuits;",
    	note = "Chip Multiprocessor;NoC (or Network-on-Chip);Process mapping;Process variations;Router design;",
    	number = 5,
    	pages = "651 - 663",
    	title = "{C}haracterizing the impact of process variation on 45 nm {N}o{C}-based {CMP}s",
    	url = "http://dx.doi.org/10.1016/j.jpdc.2010.09.006",
    	volume = 71,
    	year = 2011
    }
    
  27. Jesus Escudero-Sahuquillo, Ernst Gunnar Gran, Pedro Javier Garcia, Jose Flich, Tor Skeie, Olav Lysne, Francisco Jose Quiles and Jose Duato. Combining Congested-Flow Isolation and Injection Throttling in HPC Interconnection Networks. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 662-672. DOI BibTeX

    @conference{ 6047234,
    	author = "Jesus Escudero-Sahuquillo and Ernst Gunnar Gran and Pedro Javier Garcia and Flich, Jose and Tor Skeie and Olav Lysne and Francisco Jose Quiles and Duato, Jose",
    	abstract = "Existing congestion control mechanisms in interconnects can be divided into two general approaches. One is to throttle traffic injection at the sources that contribute to congestion, and the other is to isolate the congested traffic in specially designated resources. These two approaches have different, but non-overlapping weaknesses. In this paper we present in detail a method that combines injection throttling and congested-flow isolation. Through simulation studies we first demonstrate the respective flaws of the injection throttling and of flow isolation. Thereafter we show that our combined method extracts the best of both approaches in the sense that it gives fast reaction to congestion, it is scalable and it has good fairness properties with respect to the congested flows.",
    	booktitle = "Parallel Processing (ICPP), 2011 International Conference on",
    	doi = "10.1109/ICPP.2011.80",
    	issn = "0190-3918",
    	month = "sept.",
    	pages = "662 -672",
    	title = "{C}ombining {C}ongested-{F}low {I}solation and {I}njection {T}hrottling in {HPC} {I}nterconnection {N}etworks",
    	year = 2011
    }
    
  28. J Escudero-Sahuquillo, P J Garcia, F J Quiles, Jose Flich and Jose Duato. Cost-effective queue schemes for reducing head-of-line blocking in fat-trees. Concurrency and Computation: Practice and Experience 12(15), 2011. URL, DOI BibTeX

    @article{ ip51411971,
    	author = "J. Escudero-Sahuquillo and P.J. Garcia and F.J. Quiles and Flich, Jose and Duato, Jose",
    	abstract = "The fat-tree is one of the most common topologies among the interconnection networks of the systems currently used for high-performance parallel computing. Among other advantages, fat-trees allow the use of simple but very efficient routing schemes. One of them is a deterministic routing algorithm that has been recently proposed, offering a similar (or better) performance than adaptive routing while reducing complexity and guaranteeing in-order packet delivery. However, as other deterministic routing proposals, this deterministic routing algorithm cannot react when high traffic loads or hot-spot traffic scenarios produce severe contention for the use of network resources, leading to the appearance of Head-of-Line (HoL) blocking, which spoils the network performance. In that sense, we describe in this paper two simple, cost-effective strategies for dealing with the HoL-blocking problem that may appear in fat-trees with the aforementioned deterministic routing algorithm. From the results presented in the paper, we conclude that, in the mentioned environment, these proposals considerably reduce HoL-blocking without significantly increasing switch complexity and the required silicon area. © 2011 John Wiley {\&} Sons, Ltd.",
    	doi = "10.1002/cpe.1764",
    	issn = "1532-0626",
    	journal = "Concurrency Computation Practice and Experience",
    	key = "Trees (mathematics)",
    	keywords = "Cost effectiveness;Network performance;Packet networks;Parallel architectures;Routing algorithms;",
    	note = "Adaptive routing;Deterministic routing;Deterministic routing algorithms;Efficient routing;Head of line blocking;Hot-spot traffic;In-order packet delivery;Network resource;Parallel Computing;Silicon area;Switch complexity;Traffic loads;",
    	number = 15,
    	title = "{C}ost-effective queue schemes for reducing head-of-line blocking in fat-trees",
    	url = "http://dx.doi.org/10.1002/cpe.1764",
    	volume = 12,
    	year = 2011
    }
    
  29. Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Cost-Efficient On-Chip Routing Implementations for CMP and MPSoC Systems. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 30(4):534-547, April 2011. URL, DOI BibTeX

    @article{ 5737867,
    	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and S. Medardoni and D. Bertozzi and , and Silla, Federico and Duato, Jose",
    	abstract = "The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept.",
    	doi = "10.1109/TCAD.2011.2119150",
    	issn = "0278-0070",
    	journal = "Computer-Aided Design of Integrated Circuits and Systems, IEEE Transactions on",
    	keywords = "Fault-tolerance , logic design , networks-on-chip , routing",
    	month = "april",
    	number = 4,
    	pages = "534 -547",
    	title = "{C}ost-{E}fficient {O}n-{C}hip {R}outing {I}mplementations for {CMP} and {MPS}o{C} {S}ystems",
    	url = "http://dx.doi.org/10.1109/TCAD.2011.2119150",
    	volume = 30,
    	year = 2011
    }
    
  30. Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Cost-efficient on-chip routing implementations for CMP and MPSoC systems. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 30(4):534-547, 2011. URL, DOI BibTeX

    @article{ 20111313880819,
    	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and S. Medardoni and D. Bertozzi and , and Silla, Federico and Duato, Jose",
    	abstract = "The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept. © 2006 IEEE.",
    	address = "445 Hoes Lane / P.O. Box 1331, Piscataway, NJ 08855-1331, United States",
    	doi = "10.1109/TCAD.2011.2119150",
    	issn = 02780070,
    	journal = "IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems",
    	key = "Fault tolerance",
    	keywords = "Computer software selection and evaluation;Logic design;Microprocessor chips;Quality assurance;Telecommunication networks;Topology;",
    	note = "Cost-efficient;Distributed routing;Efficient routing;High-performance computing;Irregular topology;Key component;Latency constraints;Many-core;Networks on chips;networks-on-chip;On chips;Performance costs;Power Consumption;Power-aware;Router model;routing;Routing table;Universal logic;",
    	number = 4,
    	pages = "534 - 547",
    	title = "{C}ost-efficient on-chip routing implementations for {CMP} and {MPS}o{C} systems",
    	url = "http://dx.doi.org/10.1109/TCAD.2011.2119150",
    	volume = 30,
    	year = 2011
    }
    
  31. Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Cost-Efficient On-Chip Routing Implementations for CMP and MPSoC Systems. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 30(4):534-547, 2011. URL, DOI BibTeX

    @article{ 11874902,
    	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and S. Medardoni and D. Bertozzi and , and Silla, Federico and Duato, Jose",
    	abstract = "The high-performance computing domain is enriching with the inclusion of networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area, and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism, or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge. This paper presents universal logic-based distributed routing (uLBDR), an efficient logic-based mechanism that adapts to any irregular topology derived from 2-D meshes, instead of using routing tables. uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the tradeoff between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the tradeoff between fault tolerance and performance. Power consumption, area, and delay estimates are also provided highlighting the efficiency of the mechanism. To do this, different router models (one for CMPs and one for MPSoCs) have been designed as a proof concept.",
    	address = "USA",
    	doi = "10.1109/TCAD.2011.2119150",
    	issn = "0278-0070",
    	journal = "IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems",
    	keywords = "microprocessor chips;network routing;network-on-chip;",
    	note = "cost-efficient on-chip routing implementations;chip multiprocessors;CMP;MPSoC Systems;many-core system-on-chip;networks-on-chip;communication scalability;latency constraints;area constraints;power constraints;application-level parallelism;power-aware techniques;topology regularity;universal logic-based distributed routing;logic-based mechanism;2D meshes;fault tolerance;fault performance;power consumption;",
    	number = 4,
    	pages = "534 - 47",
    	title = "{C}ost-{E}fficient {O}n-{C}hip {R}outing {I}mplementations for {CMP} and {MPS}o{C} {S}ystems",
    	url = "http://dx.doi.org/10.1109/TCAD.2011.2119150",
    	volume = 30,
    	year = 2011
    }
    
  32. Frank Olaf Sem-Jacobsen, Tor Skeie, Olav Lysne and Jose Duato. Dynamic fault tolerance in fat trees. IEEE Transactions on Computers 60(4):508 - 525, 2011. URL BibTeX

    @article{ 20111013718747,
    	author = "Frank Olaf Sem-Jacobsen and Tor Skeie and Olav Lysne and Duato, Jose",
    	abstract = "Fat trees are a very common communication architecture in current large-scale parallel computers. The probability of failure in these systems increases with the number of components. We present a routing method for deterministically and adaptively routed fat trees, applicable to both distributed and source routing, that is able to handle several concurrent faults and that transparently returns to the original routing strategy once the faulty components have recovered. The method is local and dynamic, completely masking the fault from the rest of the system. It only requires a small extra functionality in the switches to handle rerouting packets around a fault. The method guarantees connectedness and deadlock and livelock freedom for up to k -1 benign simultaneous switch and/or link faults where k is half the number of ports in the switches. Our simulation experiments show a graceful degradation of performance as more faults occur. Furthermore, we demonstrate that for most fault combinations, our method will even be able to handle significantly more faults beyond the k -1 limit with high probability. {\&}copy; 2011 IEEE.",
    	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
    	issn = 00189340,
    	journal = "IEEE Transactions on Computers",
    	key = "Quality assurance",
    	keywords = "Computer architecture;Fault tolerance;",
    	note = "Adaptive routing;deterministic routing;Dynamic faults;Fat trees;k-ary n-trees;",
    	number = 4,
    	pages = "508 - 525",
    	title = "{D}ynamic fault tolerance in fat trees",
    	url = "http://dx.doi.org/10.1109/TC.2010.97",
    	volume = 60,
    	year = 2011
    }
    
  33. F O Sem-Jacobsen, T Skeie, O Lysne and Jose Duato. Dynamic Fault Tolerance in Fat Trees. IEEE Transactions on Computers 60(4):508-525, 2011. URL, DOI BibTeX

    @article{ 11837626,
    	author = "F.O. Sem-Jacobsen and T. Skeie and O. Lysne and Duato, Jose",
    	abstract = "Fat trees are a very common communication architecture in current large-scale parallel computers. The probability of failure in these systems increases with the number of components. We present a routing method for deterministically and adaptively routed fat trees, applicable to both distributed and source routing, that is able to handle several concurrent faults and that transparently returns to the original routing strategy once the faulty components have recovered. The method is local and dynamic, completely masking the fault from the rest of the system. It only requires a small extra functionality in the switches to handle rerouting packets around a fault. The method guarantees connectedness and deadlock and livelock freedom for up to k -1 benign simultaneous switch and/or link faults where k is half the number of ports in the switches. Our simulation experiments show a graceful degradation of performance as more faults occur. Furthermore, we demonstrate that for most fault combinations, our method will even be able to handle significantly more faults beyond the k -1 limit with high probability.",
    	address = "USA",
    	doi = "10.1109/TC.2010.97",
    	issn = "0018-9340",
    	journal = "IEEE Transactions on Computers",
    	keywords = "failure analysis;fault tolerant computing;large-scale systems;network routing;parallel architectures;parallel machines;trees;",
    	note = "dynamic fault tolerance;fat tree;communication architecture;large-scale parallel computer;failure probability;routing method;source routing;distributed routing;concurrent fault;rerouting packet handling;",
    	number = 4,
    	pages = "508 - 25",
    	title = "{D}ynamic {F}ault {T}olerance in {F}at {T}rees",
    	url = "http://dx.doi.org/10.1109/TC.2010.97",
    	volume = 60,
    	year = 2011
    }
    
  34. J Cano, J Flich, J Duato, M Coppola and R Locatelli. Efficient routing implementation in complex systems-on-chip. In Networks on Chip (NoCS), 2011 Fifth IEEE/ACM International Symposium on. May 2011, 1-8. BibTeX

    @conference{ 5948564,
    	author = "J. Cano and J. Flich and J. Duato and M. Coppola and R. Locatelli",
    	booktitle = "Networks on Chip (NoCS), 2011 Fifth IEEE/ACM International Symposium on",
    	keywords = "SoC topologies;application-specific SoC;complex systems-on-chip;memory structures;routing tables;network topology;system-on-chip;",
    	month = "may",
    	pages = "1 -8",
    	title = "{E}fficient routing implementation in complex systems-on-chip",
    	year = 2011
    }
    
  35. Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Ortí. Enabling CUDA acceleration within virtual machines using rCUDA. In Proceedings of HiPC 2011. 2011. URL BibTeX

    @conference{ n/a,
    	author = "Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Rafael Mayo and Enrique S. Quintana-Ort{\'i}",
    	abstract = "The hardware and software advances of Graphics Processing Units (GPUs) have favored the develop- ment of GPGPU (General-Purpose Computation on GPUs) and its adoption in many scientific, engineering, and industrial areas. Thus, GPUs are increasingly being introduced in high-performance computing systems as well as in datacenters. On the other hand, virtualization technologies are also receiving rising interest in these domains, because of their many benefits on acquisition and maintenance savings. There are currently several works on GPU virtualization. However, there is no standard solution allowing access to GPGPU capabilities from virtual machine environments like, e.g., VMware, Xen, VirtualBox, or KVM. Such lack of a standard solution is delaying the integration of GPGPU into these domains.",
    	journal = "Proceedings of HiPC 2011",
    	keywords = "Virtual machine;rCUDA",
    	note = "Clusters;CUDA;High performance computing;Virtualizations;",
    	title = "{E}nabling {CUDA} acceleration within virtual machines using r{CUDA}",
    	url = "http://www.hipc.org/hipc2011/program.php",
    	year = 2011
    }
    
  36. Carles Hernández, Federico Silla and Jose Duato. Energy and Performance Efficient Thread Mapping in NoC-Based CMPs under Process Variations. In ICPP. 2011, 41-50. BibTeX

    @conference{ dblp:conf/icpp/hernandezsd11,
    	author = "Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose",
    	booktitle = "ICPP",
    	crossref = "DBLP:conf/icpp/2011",
    	pages = "41-50",
    	title = "{E}nergy and {P}erformance {E}fficient {T}hread {M}apping in {N}o{C}-{B}ased {CMP}s under {P}rocess {V}ariations",
    	year = 2011
    }
    
  37. Carles Hernández, Federico Silla and Jose Duato. Energy and Performance Efficient Thread Mapping in NoC-Based CMPs under Process Variations. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 41-50. DOI BibTeX

    @conference{ 6047171,
    	author = "Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose",
    	abstract = "Within-die process variation causes cores, memories, and network resources in NoC-based CMPs to present different speeds and leakage power. In this context, thread mapping strategies that consider the effects of process variability on chip resources arise as a suitable choice to maximize performance while energy consumption constraints are satisfied. However, other factors, as the location of memory controllers and the concurrent execution of several applications in the chip, can bound the possible benefits of such mapping strategies. In this paper we propose a mapping strategy, named as uniform regions, that takes variability effects into account when assigning application threads to cores in the chip. More specifically, uniform regions, in terms of operating frequency, that additionally present the highest available frequency, are selected so that the benefits of such a variation-aware mapping strategy in a NoC-based CMP are maximized. We additionally present two different ways of configuring the frequency and voltage of the cores in the selected region. The first one is intended to provide the maximum performance while keeping energy as low as possible, while the second one is much more for energy-aware. The first one reduces the execution time up to a 23 #x025; while reducing the energy up to 24 #x025; whereas the second one provides smaller speed ups while reduces energy up to 33 #x025;.",
    	booktitle = "Parallel Processing (ICPP), 2011 International Conference on",
    	doi = "10.1109/ICPP.2011.48",
    	issn = "0190-3918",
    	month = "sept.",
    	pages = "41 -50",
    	title = "{E}nergy and {P}erformance {E}fficient {T}hread {M}apping in {N}o{C}-{B}ased {CMP}s under {P}rocess {V}ariations",
    	year = 2011
    }
    
  38. Carles Hernández, Antoni Roca, Jose Flich, Federico Silla and Jose Duato. Fault-Tolerant Vertical Link Design for Effective 3D Stacking. IEEE Computer Architecture Letters 99(RapidPosts), 2011. URL, DOI BibTeX

    @article{ 10.1109/l-ca.2011.17,
    	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
    	address = "Los Alamitos, CA, USA",
    	doi = "10.1109/L-CA.2011.17",
    	issn = "1556-6056",
    	journal = "IEEE Computer Architecture Letters",
    	number = "RapidPosts",
    	publisher = "IEEE Computer Society",
    	title = "{F}ault-{T}olerant {V}ertical {L}ink {D}esign for {E}ffective 3{D} {S}tacking",
    	url = "http://doi.ieeecomputersociety.org/10.1109/L-CA.2011.17",
    	volume = 99,
    	year = 2011
    }
    
  39. Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. How to reduce packet dropping in a bufferless NoC. Concurrency and Computation: Practice and Experience 23(1):86 - 99, 2011. URL, DOI BibTeX

    @article{ 11723780,
    	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
    	abstract = "Networks on-chip (NoCs) interconnect the components located inside a chip. In multicore chips, NoCs have a strong impact on the overall system performance. NoC bandwidth is limited by the critical path delay. Recent works show that the critical path delay is heavily affected by switch port buffer size. Therefore, by removing buffers, switch clock frequency can be increased. Recently, a new switching technique for NoCs called Blind Packet Switching (BPS) has been proposed, which is based on removing the switch port buffers. Since buffers consume a high percentage of switch power and area, BPS not only improves performance but also reduces power and area. In BPS, as there are no buffers at the switch ports, packets cannot be stopped and stored on them. If contention arises packets are dropped and later reinjected, negatively affecting performance. In order to prevent packet dropping, some techniques based on resource replication have been proposed. In this paper, we propose some alternative and complementary techniques that do not rely on resource replication. By using them, packet dropping is highly reduced. In particular, packet dropping is completely removed for a very wide network traffic range. Moreover, network throughput is increased and packet latency is reduced. © 2010 John Wiley {\&} Sons, Ltd.",
    	address = "UK",
    	doi = "10.1002/cpe.1606",
    	issn = "1532-0626",
    	journal = "Concurrency and Computation: Practice and Experience",
    	keywords = "buffer circuits;circuit switching;network-on-chip;",
    	note = "packet dropping reduction;bufferless NoC;networks on-chip;critical path delay;switch clock frequency;blind packet switching;switch port buffers;network traffic range;",
    	number = 1,
    	pages = "86 - 99",
    	title = "{H}ow to reduce packet dropping in a bufferless {N}o{C}",
    	url = "http://dx.doi.org/10.1002/cpe.1606",
    	volume = 23,
    	year = 2011
    }
    
  40. Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. How to reduce packet dropping in a bufferless NoC. Concurrency and Computation: Practice and Experience 23(1):86 - 99, 2011. URL BibTeX

    @article{ 20105213526965,
    	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
    	abstract = "Networks on-chip (NoCs) interconnect the components located inside a chip. In multicore chips, NoCs have a strong impact on the overall system performance. NoC bandwidth is limited by the critical path delay. Recent works show that the critical path delay is heavily affected by switch port buffer size. Therefore, by removing buffers, switch clock frequency can be increased. Recently, a new switching technique for NoCs called Blind Packet Switching (BPS) has been proposed, which is based on removing the switch port buffers. Since buffers consume a high percentage of switch power and area, BPS not only improves performance but also reduces power and area. In BPS, as there are no buffers at the switch ports, packets cannot be stopped and stored on them. If contention arises packets are dropped and later reinjected, negatively affecting performance. In order to prevent packet dropping, some techniques based on resource replication have been proposed. In this paper, we propose some alternative and complementary techniques that do not rely on resource replication. By using them, packet dropping is highly reduced. In particular, packet dropping is completely removed for a very wide network traffic range. Moreover, network throughput is increased and packet latency is reduced. Copyright {\&}copy; 2010 John Wiley {\&}amp; Sons, Ltd.",
    	address = "Southern Gate, Chichester, West Sussex, PO19 8SQ, United Kingdom",
    	issn = "1532-0626",
    	journal = "Concurrency and Computation: Practice and Experience",
    	key = "Packet switching",
    	keywords = "Signal filtering and prediction;",
    	note = "buffer limitations;Buffer sizes;Clock frequency;Critical path delays;Multicore chips;Network throughput;Network traffic;On chips;Packet dropping;Packet latencies;Resource replication;Switch ports;Switch power;Switching techniques;",
    	number = 1,
    	pages = "86 - 99",
    	title = "{H}ow to reduce packet dropping in a bufferless {N}o{C}",
    	url = "http://dx.doi.org/10.1002/cpe.1606",
    	volume = 23,
    	year = 2011
    }
    
  41. Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. How to reduce packet dropping in a bufferless NoC. Concurrency and Computation: Practice and Experience 23(1):86-99, 2011. URL, DOI BibTeX

    @article{ dblp:journals/concurrency/requenagld11,
    	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
    	abstract = "Abstract Networks on-chip (NoCs) interconnect the components located inside a chip. In multicore chips, NoCs have a strong impact on the overall system performance. NoC bandwidth is limited by the critical path delay. Recent works show that the critical path delay is heavily affected by switch port buffer size. Therefore, by removing buffers, switch clock frequency can be increased. Recently, a new switching technique for NoCs called Blind Packet Switching (BPS) has been proposed, which is based on removing the switch port buffers. Since buffers consume a high percentage of switch power and area, BPS not only improves performance but also reduces power and area. In BPS, as there are no buffers at the switch ports, packets cannot be stopped and stored on them. If contention arises packets are dropped and later reinjected, negatively affecting performance. In order to prevent packet dropping, some techniques based on resource replication have been proposed. In this paper, we propose some alternative and complementary techniques that do not rely on resource replication. By using them, packet dropping is highly reduced. In particular, packet dropping is completely removed for a very wide network traffic range. Moreover, network throughput is increased and packet latency is reduced. Copyright © 2010 John Wiley {\&} Sons, Ltd.",
    	doi = "10.1002/cpe.1606",
    	issn = "1532-0634",
    	journal = "Concurrency and Computation: Practice and Experience",
    	keywords = "networks on-chip;buffer limitations;packet dropping reduction",
    	number = 1,
    	pages = "86-99",
    	title = "{H}ow to reduce packet dropping in a bufferless {N}o{C}",
    	url = "http://dx.doi.org/10.1002/cpe.1606",
    	volume = 23,
    	year = 2011
    }
    
  42. Blas Cuesta Sáez, Alberto Ros, Maria E Gomez, Antonio Robles and Jose Duato. Increasing the Effectiveness of Directory Caches by Deactivating Coherence for Private Memory Blocks. In 38th International Symposium on Computer Architecture (ISCA). June 2011, 93–103. URL BibTeX

    @conference{ bcuesta-isca11,
    	author = "Cuesta S{\'a}ez, Blas and Ros, Alberto and Gomez, Maria E. and Robles, Antonio and Duato, Jose",
    	address = "San Jose (California)",
    	booktitle = "38th International Symposium on Computer Architecture (ISCA)",
    	isbn = "978-1-4503-0472-6",
    	month = "jun",
    	pages = "93--103",
    	publisher = "Association for Computing Machinery (ACM)",
    	title = "{I}ncreasing the {E}ffectiveness of {D}irectory {C}aches by {D}eactivating {C}oherence for {P}rivate {M}emory {B}locks",
    	url = "http://skywalker.inf.um.es/~aros/papers/bcuesta-isca11.pdf",
    	year = 2011
    }
    
  43. , Jose Flich, Antoni Roca and Jose Duato. PC-Mesh: A Dynamic Parallel Concentrated Mesh. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 642 -651. DOI BibTeX

    @conference{ 6047232,
    	author = ", and Flich, Jose and Roca, Antoni and Duato, Jose",
    	abstract = "We present a novel network on-chip topology, PC-Mesh (Parallel Concentrated Mesh), suitable for tiled CMP systems. The topology is built using four concentrated mesh (C-Mesh) networks and a new network interface able to inject packets through different networks. The goal of the new combined topology is to minimize the power consumption of the network when running applications exhibiting low traffic rates and maximize throughput when applications require high traffic rates. Thus, the topology is dynamically adjusted (switching on and off network components) with a proper injection algorithm, adapting itself to the network on-chip traffic requirements. The PC-Mesh network performs as a C-Mesh network (using one sub network) when the traffic is low obtaining large savings in power consumption. When the load network increases, new sub networks are opened and thus higher traffic rates are supported, thus providing comparable results as the mesh network. Additional benefits of the PC-Mesh network is its fault tolerance degree and the lower latency in terms of hops. An alternative PC-Mesh version is provided to optimize the fault-tolerance degree. Comparative results with detailed evaluations (in area, power, and delay) are provided both for the network interface and switches. Results demonstrate PC-Mesh is able to dynamically adapt to the current traffic situations. Experimental results with a system-level simulation platform (including the application being run and the operating system) are provided. Results show how the PC-Mesh network achieves the same results as the C-Mesh topology reducing execution time of applications by 20 #x025; as well as energy consumption by also 20 #x025;, when compared with the 2D-Mesh network topology. However, when challenged with higher traffic demands, PC-Mesh outperforms the C-Mesh network by achieving much lower execution time of applications and lower energy consumption. In some scenarios, execution time is reduced by a factor of 2 - - and power consumption by 50 #x025;.",
    	booktitle = "Parallel Processing (ICPP), 2011 International Conference on",
    	doi = "10.1109/ICPP.2011.21",
    	issn = "0190-3918",
    	month = "sept.",
    	pages = "642 -651",
    	title = "{PC}-{M}esh: {A} {D}ynamic {P}arallel {C}oncentrated {M}esh",
    	year = 2011
    }
    
  44. Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Orti. Performance of CUDA Virtualized Remote GPUs in High Performance Clusters. In Parallel Processing (ICPP), 2011 International Conference on. 2011, 365 -374. DOI BibTeX

    @conference{ 6047204,
    	author = "Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Rafael Mayo and Enrique S. Quintana-Orti",
    	abstract = "In a previous work we presented the architecture of rCUDA, a middleware that enables CUDA remoting over a commodity network. That is, the middleware allows an application to use a CUDA-compatible Graphics Processor (GPU) installed in a remote computer as if it were installed in the computer where the application is being executed. This approach is based on the observation that GPUs in a cluster are not usually fully utilized, and it is intended to reduce the number of GPUs in the cluster, thus lowering the costs related with acquisition and maintenance while keeping performance close to that of the fully-equipped configuration. In this paper we model rCUDA over a series of high throughput networks in order to assess the influence of the performance of the underlying network on the performance of our virtualization technique. For this purpose, we analyze the traces of two different case studies over two different networks. Using this data, we calculate the expected performance for these same case studies over a series of high throughput networks, in order to characterize the expected behavior of our solution in high performance clusters. The estimations are validated using real 1 Gbps Ethernet and 40 Gbps InfiniBand networks, showing an error rate in the order of 1 #x025; for executions involving data transfers above 40 MB. In summary, although our virtualization technique noticeably increases execution time when using a 1 Gbps Ethernet network, it performs almost as efficiently as a local GPU when higher performance interconnects are used. Therefore, the small overhead incurred by our proposal because of the remote use of GPUs is worth the savings that a cluster configuration with less GPUs than nodes reports.",
    	booktitle = "Parallel Processing (ICPP), 2011 International Conference on",
    	doi = "10.1109/ICPP.2011.58",
    	issn = "0190-3918",
    	month = "sept.",
    	pages = "365 -374",
    	title = "{P}erformance of {CUDA} {V}irtualized {R}emote {GPU}s in {H}igh {P}erformance {C}lusters",
    	year = 2011
    }
    
  45. Alessandro Strano, Carles Hernández, Federico Silla and Davide Bertozzi. Self-Calibrating Source Synchronous Communication for Delay Variation Tolerant GALS Network-on-Chip Design. International Journal of Embedded and Real-Time Communication Systems (IJERTCS) 2(4):20, October 2011. DOI BibTeX

    @article{ 1947-317,
    	author = "Alesandro Strano and Hern{\'a}ndez, Carles and Silla, Federico and Davide Bertozzi",
    	doi = "doi:10.4018/jertcs.2011100101",
    	issn = "1947-3176",
    	journal = "International Journal of Embedded and Real-Time Communication Systems (IJERTCS)",
    	month = "October",
    	number = 4,
    	pages = 20,
    	title = "{S}elf-{C}alibrating {S}ource {S}ynchronous {C}ommunication for {D}elay {V}ariation {T}olerant {GALS} {N}etwork-on-{C}hip {D}esign",
    	volume = 2,
    	year = 2011
    }
    
  46. , Jose Flich, Jose Duato, H Eberle and W Olesinski. Towards an Efficient NoC Topology through Multiple Injection Ports. In Digital System Design (DSD), 2011 14th Euromicro Conference on. 2011, 165 -172. DOI BibTeX

    @conference{ 6037406,
    	author = ", and Flich, Jose and Duato, Jose and H. Eberle and W. Olesinski",
    	abstract = "In this paper, we present a flexible network on-chip topology: NR-Mesh (Nearest neighbor Mesh). The topology gives an end node the choice to inject a message through different neighboring routers, thereby reducing hop count and saving latency. At the receiver side, a message may be delivered to the end node through different routers, thus reducing hop count further and increasing flexibility when routing messages. This flexibility allows for maximizing network components to be in switch off mode, thus enabling power aware routing algorithms. Additional benefits are reduced congestion/contention levels in the network, support for efficient broadcast operations, savings in power consumption, and partial fault-tolerance. Our second contribution is a power management technique for the adaptive routing. This technique turns router ports and their attached links on and off depending on traffic conditions. The power management technique is able to achieve significant power savings when there is low traffic in the network. We further compare the new topology with the 2D-Mesh, using either deterministic or adaptive routing. When compared with the 2D-Mesh using deterministic routing, executing real applications in a full system simulation platform, the NR-Mesh topology using adaptive routing is able to obtain significant savings, 7% of reduction in execution time and 75% in energy consumption at the network on average for a 16-Node CMP System. Similar numbers are achieved for a 32-Node CMP system.",
    	booktitle = "Digital System Design (DSD), 2011 14th Euromicro Conference on",
    	doi = "10.1109/DSD.2011.25",
    	keywords = "CMP system;NR-mesh topology;NoC topology;adaptive routing;broadcast operation;congestion level;contention level;deterministic routing;energy consumption;fault-tolerance;flexible network on-chip topology;hop count;injection port;nearest neighbor mesh;neigh",
    	month = "31 2011-sept. 2",
    	pages = "165 -172",
    	title = "{T}owards an {E}fficient {N}o{C} {T}opology through {M}ultiple {I}njection {P}orts",
    	year = 2011
    }
    
  47. F Trivino, J Sanchez, F J Alfaro and Jose Flich. Virtualizing network-on-chip resources in chip-multiprocessors. Microprocessors and Microsystems 35(2):230 - 245, 2011. URL, DOI BibTeX

    @article{ 11839233,
    	author = "F. Trivino and J. Sanchez and F.J. Alfaro and Flich, Jose",
    	abstract = "The number of cores on a single silicon chip is rapidly growing and chips containing tens or even hundreds of identical cores are expected in the future. To take advantage of multicore chips, multiple applications will run simultaneously. As a consequence, the traffic interferences between applications increases and the performance of individual applications can be seriously affected.In this paper, we improve the individual application performance when several applications are simultaneously running. This proposal is based on the virtualization concept and allows us to reduce execution time and network latency in a significant percentage. [All rights reserved Elsevier].",
    	address = "Netherlands",
    	doi = "10.1016/j.micpro.2010.10.001",
    	issn = "0141-9331",
    	journal = "Microprocessors and Microsystems",
    	keywords = "multiprocessing systems;network-on-chip;",
    	note = "virtualizing network-on-chip resources;chip multiprocessors;single silicon chip;identical cores;multicore chips;traffic interferences;virtualization concept;",
    	number = 2,
    	pages = "230 - 45",
    	title = "{V}irtualizing network-on-chip resources in chip-multiprocessors",
    	url = "http://dx.doi.org/10.1016/j.micpro.2010.10.001",
    	volume = 35,
    	year = 2011
    }
    
  48. Alberto Ros, M E Acacio and J M Garcia. A Direct Coherence Protocol for Many-Core Chip Multiprocessors. Parallel and Distributed Systems, IEEE Transactions on 21(12):1779 -1792, 2010. URL, DOI BibTeX

    @article{ 5432165,
    	author = "Ros, Alberto and M.E. Acacio and J.M. Garcia",
    	abstract = "Future many-core CMP designs that will integrate tens of processor cores on-chip will be constrained by area and power. Area constraints make impractical the use of a bus or a crossbar as the on-chip interconnection network, and tiled CMPs organized around a direct interconnection network will probably be the architecture of choice. Power constraints make impractical to rely on broadcasts (as, for example, Token-CMP does) or any other brute-force method for keeping cache coherence, and directory-based cache coherence protocols are currently being employed. Unfortunately, directory protocols introduce indirection to access directory information, which negatively impacts performance. In this work, we present DiCo-CMP, a novel cache coherence protocol especially suited to future many-core tiled CMP architectures. In DiCo-CMP, the task of storing up-to-date sharing information and ensuring ordered accesses for every memory block is assigned to the cache that must provide the block on a miss. Therefore, DiCo-CMP reduces the miss latency compared to a directory protocol by sending requests directly to the cache that provides the block in a cache miss. These latency reductions result in improvements in execution time of up to 6 percent, on average, over a directory protocol. In comparison with Token-CMP, our protocol only sends one request message for each cache miss, as such is able to reduce network traffic by 43 percent.",
    	doi = "10.1109/TPDS.2010.43",
    	issn = "1045-9219",
    	journal = "Parallel and Distributed Systems, IEEE Transactions on",
    	keywords = "DiCo-CMP;area constraints;brute-force method;direct interconnection network;directory-based cache coherence protocols;many-core chip multiprocessors;many-core tiled CMP architectures;on-chip interconnection network;token-CMP;cache storage;coherence;microp",
    	month = "dec.",
    	number = 12,
    	pages = "1779 -1792",
    	title = "{A} {D}irect {C}oherence {P}rotocol for {M}any-{C}ore {C}hip {M}ultiprocessors",
    	url = "http://dx.doi.org/10.1109/TPDS.2010.43",
    	volume = 21,
    	year = 2010
    }
    
  49. Antoni Roca, Jose Flich, Federico Silla and Jose Duato. A Latency-Efficient Router Architecture for CMP Systems. In Digital System Design: Architectures, Methods and Tools (DSD), 2010 13th Euromicro Conference on. 2010, 165 -172. URL, DOI BibTeX

    @conference{ 5615623,
    	author = "Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
    	abstract = "As technology advances, the number of cores in Chip Multi Processor systems (CMPs) and Multi Processor Systems-on-Chips (MPSoCs) keeps increasing. Current test chips and products reach tens of cores, and it is expected to reach hundreds of cores in the near future. Such complexity demands for an efficient network-on-chip (NoC). The common choice to build such networks is the 2D mesh topology (as it matches the regular tile-based design) and the Dimension-Order Routing (DOR) algorithm (because its simplicity). The network in such systems must provide sustained throughput and ultra low latencies. One of the key components in the network is the router, and thus, it plays a major role when designing for such performance levels. In this paper we propose a new pipelined router design focused in reducing the router latency. As a first step we identify the router components that take most of the critical path, and thus limit the router frequency. In particular, the arbiter is the one limiting the performance of the router. Based on this fact, we simplify the arbiter logic by using multiple smaller arbiters. The initial set of requests in the initial arbiter is then distributed over the smaller arbiters that operate in parallel. With this design procedure, and with a proper internal router organization, different router architectures are evolved. All of them enable the use of smaller arbiters in parallel by replicating ports and assuming the use of the DOR algorithm. The net result of such changes is a faster router. Preliminary results demonstrate a router latency reduction ranging from 10 #x025; to 21 #x025; with an increase of the router area. Network latency is reduced in a range from 11% to 15%.",
    	booktitle = "Digital System Design: Architectures, Methods and Tools (DSD), 2010 13th Euromicro Conference on",
    	doi = "10.1109/DSD.2010.42",
    	isbn = "978-1-4244-7839-2",
    	keywords = "arbiter design;low latency router;network-on-chip;router architecture;router design",
    	month = "sept.",
    	pages = "165 -172",
    	title = "{A} {L}atency-{E}fficient {R}outer {A}rchitecture for {CMP} {S}ystems",
    	url = "http://dx.doi.org/10.1109/DSD.2010.42",
    	year = 2010
    }
    
  50. Carles Hernández, Federico Silla and Jose Duato. A Methodology for the Characterization of Process Variation in NoC Links. In 2010 Design, Automation & Test in Europe Conference & Exhibition (DATE 2010). March 2010, 685-690. URL BibTeX

    @conference{ 11283352,
    	author = "Hern{\'a}ndez, Carles and Silla, Federico and Duato, Jose",
    	abstract = "Associated with the ever growing integration scales is the increase in process variability. In the context of network-on-chip, this variability affects the maximum frequency that could be sustained by each link that interconnects two cores in a chip multiprocessor. In this paper we present a methodology to model delay variations in NoC links. We also show its application to several technologies, namely 45nm, 32nm, 22nm, and 16nm. Simulation results show that conclusions about variability greatly depend on the implementation context.",
    	address = "Dresden, Germany",
    	booktitle = "2010 Design, Automation {\&} Test in Europe Conference {\&} Exhibition (DATE 2010)",
    	isbn = "978-3-9810801-6-2",
    	journal = "2010 Design, Automation {\&}amp; Test in Europe Conference {\&}amp; Exhibition (DATE 2010)",
    	keywords = "multiprocessor interconnection networks;network-on-chip;",
    	month = "March",
    	note = "process variation;NoC Links;network-on-chip;chip multiprocessor;process variability;",
    	pages = "685-690",
    	publisher = "EDDA",
    	title = "{A} {M}ethodology for the {C}haracterization of {P}rocess {V}ariation in {N}o{C} {L}inks",
    	url = "http://www.date-conference.com/proceedings/PAPERS/2010/DATE10/PDFFILES/06.3_2.PDF",
    	year = 2010
    }
    
  51. Héctor Montaner, Federico Silla and Jose Duato. A practical way to extend shared memory support beyond a motherboard at low cost. In Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing. June 2010, 155-166. URL, DOI BibTeX

    @conference{ montaner:2010:pwe:1851476.1851495,
    	author = "Montaner, H{\'e}ctor and Silla, Federico and Duato, Jose",
    	abstract = "Improvements in parallel computing hardware usually involve increments in the number of available resources for a given application such as the number of computing cores and the amount of memory. In the case of shared-memory computers, the increase in computing resources and available memory is usually constrained by the coherency protocol, whose overhead rises with system size, limiting the scalability of the final system. In this paper we propose an efficient and cost-effective way to increase the memory available for a given application by leveraging free memory in other computers in the cluster. Our proposal is based on the observation that many applications benefit from having more memory resources but do not require more computing cores, thus reducing the requirements for cache coherency and allowing a simpler implementation and better scalability. Simulation results show that, when additional mechanisms intended to hide remote memory latency are used, execution time of applications that use our proposal is similar to the time required to execute them in a computer populated with enough local memory, thus validating the feasibility of our proposal. We are currently building a prototype that implements our ideas.",
    	address = "Chicago, Illinois",
    	booktitle = "Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing",
    	doi = "10.1145/1851476.1851495",
    	isbn = "978-1-60558-942-8",
    	keywords = "memory;",
    	month = "June",
    	pages = "155-166",
    	publisher = "ACM",
    	series = "HPDC '10",
    	title = "{A} practical way to extend shared memory support beyond a motherboard at low cost",
    	url = "http://doi.acm.org/10.1145/1851476.1851495",
    	year = 2010
    }
    
  52. Joan-Lluis Ferrer, Elvira Baydal, Antonio Robles, Pedro Lopez and Jose Duato. A Scalable and Early Congestion Management Mechanism for MINs. In Proceedings of the 18th Euromicro Conference on Parallel, Distributed and Network-Based Processing, PDP 2010. 2010, 43 - 50. URL BibTeX

    @conference{ 11260741,
    	author = "Ferrer, Joan-Lluis and Baydal, Elvira and Robles, Antonio and Lopez, Pedro and Duato, Jose",
    	abstract = "Several packet marking-based mechanisms have been proposed to manage congestion in multistage interconnection networks. One of them, the MVCM mechanism obtains very good results for different network configurations and traffic loads. However, as MVCM applies full virtual output queuing at origin, its memory requirements may jeopardize its scalability. Additionally, the applied packet marking technique introduces certain delay to detect congestion. In this paper, we propose and evaluate the Scalable Early Congestion Management mechanism which eliminates the drawbacks exhibited by MVCM. The new mechanism replaces the full virtual output queuing at origin by either a partial virtual output queuing or a shared buffer, in order to reduce its memory requirements, thus making the mechanism scalable. Also, it applies an improved packet marking technique based on marking packets at output buffers regardless of their marking at input buffers, which simplifies the marking technique, allowing also a sooner detection of the root of a congestion tree.",
    	address = "Piscataway, NJ, USA",
    	booktitle = "Proceedings of the 18th Euromicro Conference on Parallel, Distributed and Network-Based Processing, PDP 2010",
    	journal = "Proceedings of the 18th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP 2010)",
    	keywords = "multistage interconnection networks;",
    	note = "packet marking based mechanisms;multistage interconnection networks;MVCM mechanism;virtual output queuing;scalable early congestion management mechanism;shared buffer;",
    	pages = "43 - 50",
    	title = "{A} {S}calable and {E}arly {C}ongestion {M}anagement {M}echanism for {MIN}s",
    	url = "http://dx.doi.org/10.1109/PDP.2010.36",
    	year = 2010
    }
    
  53. M Serrano, Julio Sahuquillo, Houcine Hassan Mohamed, Salvador Petit and Jose Duato. A Scheduling Heuristic to Handle Local and Remote Memory in Cluster Computers. In High Performance Computing and Communications (HPCC), 2010 12th IEEE International Conference on. 2010, 35 -42. URL, DOI BibTeX

    @conference{ 5581321,
    	author = "M. Serrano and Sahuquillo, Julio and Mohamed, Houcine Hassan and Petit, Salvador and Duato, Jose",
    	abstract = "In cluster computers, RAM memory is spread among the motherboards hosting the running applications. In these systems, it is common to constrain the memory address space of a given processor to the local motherboard. Constraining the system in this way is much cheaper than using a full-fledged shared memory implementation among motherboards. However, in this case, memory usage might widely differ among motherboards depending on the memory requirements of the applications running on each motherboard. In this context, if an application requires a huge quantity of RAM memory, the only feasible solution is to increase the amount of available memory in its local motherboard, even if the remaining ones are underused. Nevertheless, beyond a certain memory size, this memory budget increase becomes prohibitive. In this paper, we assume that the Remote Memory Access hardware used in a Hyper Transport based system allows applications to allocate the required memory from remote motherboards. We also analyze how the distribution of memory accesses among different memory locations (local or remote) impact on performance. Finally, an heuristic is devised to schedule local and remote memory among applications according to their requirements, and considering quality of service constraints.",
    	booktitle = "High Performance Computing and Communications (HPCC), 2010 12th IEEE International Conference on",
    	doi = "10.1109/HPCC.2010.75",
    	isbn = "978-1-4244-8335-8",
    	keywords = "hyper transport based system;local memory handling;random access memory;remote memory access hardware;remote memory handling;remote motherboards;scheduling heuristic;random-access storage;scheduling;storage management;",
    	month = "sept.",
    	pages = "35 -42",
    	title = "{A} {S}cheduling {H}euristic to {H}andle {L}ocal and {R}emote {M}emory in {C}luster {C}omputers",
    	url = "http://dx.doi.org/10.1109/HPCC.2010.75",
    	year = 2010
    }
    
  54. Samuel Rodrigo, Jose Flich, Antoni Roca, S Medardoni, D Bertozzi, , Federico Silla and Jose Duato. Addressing Manufacturing Challenges with Cost-Efficient Fault Tolerant Routing. In Networks-on-Chip (NOCS), 2010 Fourth ACM/IEEE International Symposium on. May 2010, 25 -32. URL, DOI BibTeX

    @conference{ 5507564,
    	author = "Rodrigo, Samuel and Flich, Jose and Roca, Antoni and S. Medardoni and D. Bertozzi and , and Silla, Federico and Duato, Jose",
    	abstract = "The high-performance computing domain is enriching with the inclusion of Networks-on-chip (NoCs) as a key component of many-core (CMPs or MPSoCs) architectures. NoCs face the communication scalability challenge while meeting tight power, area and latency constraints. Designers must address new challenges that were not present before. Defective components, the enhancement of application-level parallelism or power-aware techniques may break topology regularity, thus, efficient routing becomes a challenge.In this paper, uLBDR (Universal Logic-Based Distributed Routing) is proposed as an efficient logic-based mechanism that adapts to any irregular topology derived from 2D meshes, being an alternative to the use of routing tables (either at routers or at end-nodes). uLBDR requires a small set of configuration bits, thus being more practical than large routing tables implemented in memories. Several implementations of uLBDR are presented highlighting the trade-off between routing cost and coverage. The alternatives span from the previously proposed LBDR approach (with 30% of coverage) to the uLBDR mechanism achieving full coverage. This comes with a small performance cost, thus exhibiting the trade-off between fault tolerance and performance.",
    	booktitle = "Networks-on-Chip (NOCS), 2010 Fourth ACM/IEEE International Symposium on",
    	doi = "10.1109/NOCS.2010.12",
    	keywords = "NoC;addressing manufacturing challenges;application level parallelism;cost efficient fault tolerant routing;logic based mechanism;networks-on-chip;power aware techniques;universal logic based distributed routing;network routing;network topology;network-on",
    	month = "may",
    	pages = "25 -32",
    	title = "{A}ddressing {M}anufacturing {C}hallenges with {C}ost-{E}fficient {F}ault {T}olerant {R}outing",
    	url = "http://dx.doi.org/10.1109/NOCS.2010.12",
    	year = 2010
    }
    
  55. Jose Duato, Francisco D Igual, Rafael Mayo, Antonio José Peña, Enrique S Quintana-Orti and Federico Silla. An efficient implementation of GPU virtualization in high performance clusters. In Euro-Par 2009 – Parallel Processing Workshops 6043 LNCS. 2010, 385 - 394. URL BibTeX

    @conference{ 20102913080626,
    	author = "Duato, Jose and Francisco D. Igual and Rafael Mayo and Pe{\~n}a, Antonio Jos{\'e} and Enrique S. Quintana-Orti and Silla, Federico",
    	abstract = "Current high performance clusters are equipped with high bandwidth/low latency networks, lots of processors and nodes, very fast storage systems, etc. However, due to economical and/or power related constraints, in general it is not feasible to provide an accelerating co-processor -such as a graphics processor (GPU)- per node. To overcome this, in this paper we present a GPU virtualization middleware, which makes remote CUDA-compatible GPUs available to all the cluster nodes. The software is implemented on top of the sockets application programming interface, ensuring portability over commodity networks, but it can also be easily adapted to high performance networks. © 2010 Springer-Verlag.",
    	address = "Delft, Netherlands",
    	booktitle = "Euro-Par 2009 – Parallel Processing Workshops",
    	issn = "0302-9743",
    	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
    	key = "Data storage equipment",
    	keywords = "Application programming interfaces;Computer graphics equipment;Computer software portability;Middleware;Nanotechnology;Program processors;",
    	note = "Cluster nodes;Co-processors;Efficient implementation;Graphics processor;High performance cluster;High performance computing;High performance networks;Storage systems;Virtualizations;",
    	pages = "385 - 394",
    	title = "{A}n efficient implementation of {GPU} virtualization in high performance clusters",
    	url = "http://dx.doi.org/10.1007/978-3-642-14122-5_44",
    	volume = "6043 LNCS",
    	year = 2010
    }
    
  56. Diana B Rayo, Julio Sahuquillo, Houcine Hassan Mohamed, Salvador Petit and Jose Duato. Balancing Task Resource Requirements in Embedded Multithreaded Multicore Processors to Reduce Power Consumption. In Proceedings of the 18th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP 2010). February 2010, 200 - 204. URL, DOI BibTeX

    @conference{ 11260697,
    	author = "Rayo, Diana B. and Sahuquillo, Julio and Mohamed, Houcine Hassan and Petit, Salvador and Duato, Jose",
    	abstract = "Power consumption is a major design issue in modern microprocessors. Hence, power reduction techniques, like dynamic voltage scaling (DVS), are being widely implemented. Unfortunately, they impact on the task execution time so difficulting schedulability of hard real-time applications. To deal with this problem, this paper proposes a power-aware scheduler for coarse-grain embedded multicore processors implementing global DVS. To this end, this work presents two heuristics, namely Balanced Memory and Balanced CPU, which distribute the task set among cores focusing on resource utilization. Results show that with respect to a system not implementing DVS, two or five DVS levels achieve energy savings by about 35% or 51%, respectively.",
    	booktitle = "Proceedings of the 18th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP 2010)",
    	doi = "10.1109/PDP.2010.64",
    	isbn = "978-1-4244-5672-7",
    	issn = "1066-6192",
    	journal = "Proceedings of the 18th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP 2010)",
    	keywords = "microprocessor chips;multi-threading;power consumption;scheduling;",
    	month = "Feb",
    	note = "task resource requirements;embedded multithreaded multicore processors;power consumption reduction;power reduction techniques;dynamic voltage scaling;global DVS;",
    	pages = "200 - 4",
    	title = "{B}alancing {T}ask {R}esource {R}equirements in {E}mbedded {M}ultithreaded {M}ulticore {P}rocessors to {R}educe {P}ower {C}onsumption",
    	url = "http://dx.doi.org/10.1109/PDP.2010.64",
    	year = 2010
    }
    
  57. Teresa Nachiondo, Jose Flich and Jose Duato. Buffer Management Strategies to Reduce HoL Blocking. Parallel and Distributed Systems, IEEE Transactions on 21(6):739 - 753, June 2010. URL, DOI BibTeX

    @article{ 4815231,
    	author = "Nachiondo, Teresa and Flich, Jose and Duato, Jose",
    	abstract = "Congestion management is likely to become a critical issue in interconnection networks, as increasing power consumption and cost concerns lead to improve the efficiency of network resources. In previous configurations, networks were usually overdimensioned and underutilized. In a smaller network, however, contention is more likely to happen and blocked packets introduce head-of-line (HoL) blocking to the rest of packets spreading congestion quickly. The best-known solution to HoL blocking is Virtual Output Queues (VOQs). However, the cost of implementing VOQs increases quadratically with the number of output ports in the network, thus, being unpractical. Therefore, a more scalable and cost-effective solution is required to reduce or eliminate HoL blocking. In this paper, we present methodologies, referred to as Destination-Based Buffer Management (DBBM), to reduce/eliminate the HoL blocking effect on interconnection networks. DBBM efficiently uses the resources (mainly memory queues) of the network. These methodologies are comprehensively evaluated in terms of throughput, scalability and fairness. Results show that the use of the DBBM strategy with a reduced number of queues at each switch is able to obtain roughly the same throughput as the VOQ mechanism. Moreover, all the proposed strategies are designed in such a way that can be used in any switch architecture. We compare DBBM with RECN, a sophisticated mechanism that eliminates HoL blocking in congestion situations. Our mechanism is able to achieve almost the same performance with very low logic requirements (in contrast with RECN).",
    	doi = "10.1109/TPDS.2009.63",
    	issn = "1045-9219",
    	journal = "Parallel and Distributed Systems, IEEE Transactions on",
    	keywords = "computer network management; quality of service; storage management; telecommunication congestion control",
    	month = "June",
    	number = 6,
    	pages = "739 - 753",
    	title = "{B}uffer {M}anagement {S}trategies to {R}educe {H}o{L} {B}locking",
    	url = "http://dx.doi.org/10.1109/TPDS.2009.63",
    	volume = 21,
    	year = 2010
    }
    
  58. Jesus Escudero-Sahuquillo, Pedro J Garcia, Francisco J Quiles, Jose Flich and Jose Duato. Cost-Effective Congestion Management for Interconnection Networks Using Distributed Deterministic Routing. In 16th International Conference on Parallel and Distributed Systems (ICPADS 2010). December 2010. BibTeX

    @conference{ icpads2010,
    	author = "Jesus Escudero-Sahuquillo and Pedro J. Garcia and Francisco J. Quiles and Flich, Jose and Duato, Jose",
    	abstract = "The Interconnection networks are essential elements in current computing systems. For this reason, achieving the best network performance, even in congestion situations, has been a primary goal in recent years. In that sense, there exist several techniques focused on eliminating the main negative effect of congestion: the Head of Line (HOL) blocking. One of the most successful HOL blocking elimination techniques is RECN, which can be applied in source routing networks. FBICM follows the same approach as RECN, but it has been developed for distributed deterministic routing networks. Although FBICM effectively eliminates HOL blocking, it requires too much resources to be implemented. In this paper we present a new FBICM version, based on a new organization of switch memory resources, that significantly reduces the required silicon area, complexity and cost. Moreover, we present new results about FBICM, in network topologies not yet analyzed. From the experiment results we can conclude that a far less complex and feasible FBICM implementation can be achieved by using the proposed improvements, while not losing efficiency.",
    	address = "Shanghai, China",
    	booktitle = "16th International Conference on Parallel and Distributed Systems (ICPADS 2010)",
    	keywords = "Deterministic Routing; Congestion Management; Head-Of-Line Blocking;",
    	month = "December",
    	title = "{C}ost-{E}ffective {C}ongestion {M}anagement for {I}nterconnection {N}etworks {U}sing {D}istributed {D}eterministic {R}outing",
    	year = 2010
    }
    
  59. Ricardo Fernandez-Pascual, Jose M Garcia, Manuel E Acacio and Jose Duato. Dealing with transient faults in the interconnection network of CMPs at the cache coherence level. IEEE Transactions on Parallel and Distributed Systems 21(8):1117 - 1131, 2010. URL BibTeX

    @article{ 20102713062175,
    	author = "Ricardo Fernandez-Pascual and Jose M. Garcia and Manuel E. Acacio and Duato, Jose",
    	abstract = "The importance of transient faults is predicted to grow due to current technology trends of increased scale of integration. One of the components that will be significantly affected by transient faults is the interconnection network of chip multiprocessors (CMPs). To deal efficiently with these faults and differently from other authors, we propose to use fault-tolerant cache coherence protocols that ensure the correct execution of programs when not all messages are correctly delivered. We describe the extensions made to a directory-based cache coherence protocol to provide fault tolerance and provide a modified set of token counting rules which are useful to design fault-tolerant token-based cache coherence protocols. We compare the directory-based fault-tolerant protocol with a token-based fault-tolerant one. We also show how to adjust the fault tolerance parameters to achieve the desired level of fault tolerance and measure the overhead achieved to be able to support very high fault rates. Simulation results using a set of scientific, multimedia, and commercial applications show that the fault tolerance measures have virtually no impact on execution time with respect to a non-fault-tolerant protocol. Additionally, our protocols can support very high rates of transient faults at the cost of slightly increased network traffic. {\&}copy; 2006 IEEE.",
    	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
    	issn = "1045-9219",
    	journal = "IEEE Transactions on Parallel and Distributed Systems",
    	key = "Quality assurance",
    	keywords = "Fault tolerance;Interconnection networks;Multiprocessing systems;Network protocols;Packet networks;",
    	note = "cache coherence;Cache coherence protocols;Chip Multiprocessor;Commercial applications;Current technology;Design faults;Execution time;Fault rates;Fault tolerant protocols;Fault-tolerant;High rate;Network traffic;Scale of integration;Simulation result;Transient faults;",
    	number = 8,
    	pages = "1117 - 1131",
    	title = "{D}ealing with transient faults in the interconnection network of {CMP}s at the cache coherence level",
    	url = "http://dx.doi.org/10.1109/TPDS.2009.148",
    	volume = 21,
    	year = 2010
    }
    
  60. Jose Flich and Davide Bertozzi (eds.). Designing Network On-Chip Architectures in the Nanoscale Era. CRC Press, December 2010. URL BibTeX

    @book{ 365336,
    	author = "Gilabert, Francisco and Silla, Federico and Gomez, Maria E. and Lodde, Mario and Roca, Antoni and Flich, Jose and Duato, Jose and Hern{\'a}ndez, Carles and Rodrigo, Samuel",
    	abstract = "Going beyond isolated research ideas and design experiences, Designing Network On-Chip Architectures in the Nanoscale Era covers the foundations and design methods of network on-chip (NoC) technology. The contributors draw on their own lessons learned to provide strong practical guidance on various design issues. Exploring the design process of the network, the first part of the book focuses on basic aspects of switch architecture and design, topology selection, and routing implementation. In the second part, contributors discuss their experiences in the industry, offering a roadmap to recent products. They describe Tilera’s TILE family of multicore processors, novel Intel products and research prototypes, and the TRIPS operand network (OPN). The last part reveals state-of-the-art solutions to hardware-related issues and explains how to efficiently implement the programming model at the network interface. In the appendix, the microarchitectural details of two switch architectures targeting multiprocessor system-on-chips (MPSoCs) and chip multiprocessors (CMPs) can be used as an experimental platform for running tests. A stepping stone to the evolution of future chip architectures, this volume provides a how-to guide for designers of current NoCs as well as designers involved with 2015 computing platforms. It cohesively brings together fundamental design issues, alternative design paradigms and techniques, and the main design tradeoffs—consistently focusing on topics most pertinent to real-world NoC designers.",
    	editor = "Flich, J.; Bertozzi, D.",
    	isbn = 9781439837108,
    	keywords = "Network on chip;Chip Architectures;",
    	month = "December",
    	publisher = "CRC Press",
    	title = "{D}esigning {N}etwork {O}n-{C}hip {A}rchitectures in the {N}anoscale {E}ra",
    	url = "http://www.crcpress.com/product/isbn/9781439837108",
    	year = 2010
    }
    
  61. Alberto Ros, Blas Cuesta Sáez, Ricardo Fernández-Pascual, Maria E Gomez, Manuel E Acacio, Antonio Robles, José M García and Jose Duato. EMC^2: Extending Magny-Cours Coherence for Large-Scale Servers. In 17th Int'l Conference on High Performance Computing (HiPC). December 2010. BibTeX

    @conference{ aros-hipc10,
    	author = "Ros, Alberto and Cuesta S{\'a}ez, Blas and Ricardo Fern{\'a}ndez-Pascual and Gomez, Maria E. and Manuel E. Acacio and Robles, Antonio and Jos{\'e} M. Garc{\'i}a and Duato, Jose",
    	abstract = "The demand of larger and more powerful highperformance shared-memory servers is growing over the last few years. To meet this need, AMD has recently launched the twelve-core Magny-Cours processors. They include a directory cache (Probe Filter) that increases the scalability of the coherence protocol applied by Opterons, based on coherent HyperTransport interconnect (cHT). cHT limits up to 8 the number of nodes that can be addressed. Recent High Node Count HT specification overcomes this limitation. However, the 3-bit pointer used by the Probe Filter prevents Magny-Cours-based servers from being built beyond 8 nodes. In this paper, we propose and develop an external logic to extend the coherence domain of Magny-Cours processors beyond the 8-node limit while maintaining the advantages provided by the Probe Filter. Evaluation results for up to a 32-node system show how the performance offered by our solution scales with the increment in the number of nodes, enhancing the Probe Filter effectiveness by filtering additional messages. Particularly, we reduce runtime by 47% in a 32-die system respect to the 8-die Magny-Cours system.",
    	address = "Goa, India",
    	booktitle = "17th Int'l Conference on High Performance Computing (HiPC)",
    	month = "December",
    	title = "{EMC}^2: {E}xtending {M}agny-{C}ours {C}oherence for {L}arge-{S}cale {S}ervers",
    	volume = "In Press, Accepted",
    	year = 2010
    }
    
  62. P Morillo, S Rueda, J M Orduna and Jose Duato. Ensuring the performance and scalability of peer-to-peer distributed virtual environments. In Future Generation Computer Systems 26(7). 2010, 905 - 915. URL BibTeX

    @conference{ 20103413166817,
    	author = "P. Morillo and S. Rueda and J.M. Orduna and Duato, Jose",
    	abstract = "Large scale distributed virtual environments (DVEs) have become a major trend in distributed applications. Peer-to-peer (P2P) architectures have been proposed as an efficient and truly scalable solution for these kinds of systems. However, in order to design efficient P2P DVEs these systems must be characterized, measuring the impact of different client behavior on system performance. This paper presents the experimental characterization of P2P DVEs. The results show that the saturation of a given client has an exclusive effect on the surrounding clients in the virtual world, having no noticeable effect at all on the rest of clients. Nevertheless, the interactions among clients that can take place in this types of systems can lead to the temporal saturation of an unbounded number of clients, thus limiting the performance of P2P DVEs. In this paper, we also discuss and propose a technique for avoiding the saturation of the client computers in P2P DVEs. The evaluation results show that the performance and the scalability of P2P DVEs are significantly improved. These results can be used as the basis for an efficient design of P2P DVEs. © 2010 Elsevier B.V. All rights reserved.",
    	address = "P.O. Box 211, Amsterdam, 1000 AE, Netherlands",
    	booktitle = "Future Generation Computer Systems",
    	journal = "Future Generation Computer Systems",
    	key = "Peer to peer networks",
    	keywords = "Adaptive filtering;Distributed computer systems;Scalability;Virtual reality;",
    	note = "Distributed applications;Distributed Virtual Environments;Efficient designs;Evaluation results;Experimental characterization;Peer to peer;Peer-to-peer architectures;Performance evaluation;Scalable solution;Virtual worlds;",
    	number = 7,
    	pages = "905 - 915",
    	title = "{E}nsuring the performance and scalability of peer-to-peer distributed virtual environments",
    	url = "http://dx.doi.org/10.1016/j.future.2010.03.003",
    	volume = 26,
    	year = 2010
    }
    
  63. Héctor Montaner, Federico Silla, H Fröning and Jose Duato. Getting Rid of Coherency Overhead for Memory-Hungry Applications. In Cluster Computing (CLUSTER), 2010 IEEE International Conference on. 2010, 48 -57. URL, DOI BibTeX

    @conference{ 5600323,
    	author = {Montaner, H{\'e}ctor and Silla, Federico and H. Fr{\"o}ning and Duato, Jose},
    	abstract = "Current commercial solutions intended to provide additional resources to an application being executed in a cluster usually aggregate processors and memory from different nodes. In this paper we present a 16-node prototype for a shared-memory cluster architecture that follows a different approach by decoupling the amount of memory available to an application from the processing resources assigned to it. In this way, we provide a new degree of freedom so that the memory granted to a process can be expanded with the memory from other nodes in the cluster without increasing the number of processors used by the program. This feature is especially suitable for memory-hungry applications that demand large amounts of memory but present a parallelization level that prevents them from using more cores than available in a single node. The main advantage of this approach is that an application can use more memory from other nodes without involving the processors, and caches, from those nodes. As a result, using more memory no longer implies increasing the coherence protocol overhead because the number of caches involved in the coherent domain has become independent from the amount of available memory. The prototype we present in this paper leverages this idea by sharing 128GB of memory among the cluster. Real executions show the feasibility of our prototype and its scalability.",
    	booktitle = "Cluster Computing (CLUSTER), 2010 IEEE International Conference on",
    	doi = "10.1109/CLUSTER.2010.14",
    	keywords = "16-node prototype;coherence protocol overhead;coherent domain;memory decoupling;memory hungry application;parallelization level;processing resource;shared memory cluster architecture;cache storage;memory architecture;pattern clustering;program processors;",
    	month = "sept.",
    	pages = "48 -57",
    	title = "{G}etting {R}id of {C}oherency {O}verhead for {M}emory-{H}ungry {A}pplications",
    	url = "http://dx.doi.org/10.1109/CLUSTER.2010.14",
    	year = 2010
    }
    
  64. Carles Hernández, Antoni Roca, Federico Silla, Jose Flich and Jose Duato. Improving the Performance of GALS-Based NoCs in the Presence of Process Variation. In 2010 ACM/IEEE International Symposium on Networks-on-Chip (NOCS). May 2010, 35 - 42. URL, DOI BibTeX

    @conference{ 11416504,
    	author = "Hern{\'a}ndez, Carles and Roca, Antoni and Silla, Federico and Flich, Jose and Duato, Jose",
    	abstract = "Current integration scales allow designing chip multiprocessors (CMP) where cores are interconnected by means of a network-on-chip (NoC). Unfortunately, the small feature size of current integration scales cause some unpredictability in manufactured devices because of process variation. In NoCs,variability may affect links and routers causing that they do not match the parameters established at design time. In this paper we first analyze the way that manufacturing deviations affect the components of a NoC by applying a comprehensive and detailed variability model to 200 instances of an 8×8 mesh NoC synthesized using 45 nm technology. A second contribution of this paper is showing that GALS-based NoCs present communication bottlenecks under process variation. To overcome this performance reduction we draft a novel approach, called performance domains, intended to reduce the negative impact of variability on application execution time. This mechanism is suitable when several applications are simultaneously running in the CMP chip.",
    	address = "Grenoble, France",
    	booktitle = "2010 ACM/IEEE International Symposium on Networks-on-Chip (NOCS)",
    	doi = "10.1109/NOCS.2010.13",
    	journal = "2010 ACM/IEEE International Symposium on Networks-on-Chip (NOCS)",
    	keywords = "integrated circuit design;large scale integration;network-on-chip;performance evaluation;",
    	month = "May",
    	note = "GALS-based NoCs;chip multiprocessors;network-on-chip;manufacturing deviations;process variation;performance domains;integration scales;",
    	pages = "35 - 42",
    	publisher = "ACM",
    	title = "{I}mproving the {P}erformance of {GALS}-{B}ased {N}o{C}s in the {P}resence of {P}rocess {V}ariation",
    	url = "http://dx.doi.org/10.1109/NOCS.2010.13",
    	year = 2010
    }
    
  65. Marina Alonso, Salvador Coll, Juan Miguel Martínez, Vicente Santonja, Pedro Lopez and Jose Duato. Power saving in regular interconnection networks. Parallel Computing 36(12):696 - 712, 2010. URL, DOI BibTeX

    @article{ marinaalonso|coll2010696,
    	author = "Alonso, Marina and Coll, Salvador and Mart{\'i}nez, Juan Miguel and Santonja, Vicente and Lopez, Pedro and Duato, Jose",
    	abstract = "The high level of computing power required for some applications can only be achieved by multiprocessor systems. These systems consist of several processors that communicate by means of an interconnection network. The huge increase both in size and complexity of high-end multiprocessor systems has triggered up their power consumption. Complex cooling systems are needed, which, in turn, increases power consumption. Power consumption reduction techniques are being applied everywhere in computer systems and the interconnection network is not an exception, as its contribution is not negligible. In this paper, we propose a mechanism to reduce interconnect power consumption that combines two alternative techniques: (i) dynamically switching on and off network links as a function of traffic (any link can be switched off, provided that network connectivity is guaranteed), (ii) dynamically reducing the available network bandwidth when traffic becomes low. In both cases, the topology of the network is not modified. Therefore, the same routing algorithm can be used regardless of the power saving actions taken, thus simplifying router design. Our simulation results show that the network power consumption can be greatly reduced, at the expense of some increase in latency. However, the achieved power reduction is always higher than the latency penalty.",
    	doi = "DOI: 10.1016/j.parco.2010.08.003",
    	issn = "0167-8191",
    	journal = "Parallel Computing",
    	keywords = "Power saving; Interconnection networks; Routing",
    	number = 12,
    	pages = "696 - 712",
    	title = "{P}ower saving in regular interconnection networks",
    	url = "http://www.sciencedirect.com/science/article/B6V12-50VTWG7-1/2/7972b8869966237a0ab6b680fd5fa6ba",
    	volume = 36,
    	year = 2010
    }
    
  66. A Strano, Carles Hernández, Federico Silla and D Bertozzi. Process variation and layout mismatch tolerant design of source synchronous links for GALS networks-on-chip. In System on Chip (SoC), 2010 International Symposium on. 2010, 43 -48. URL, DOI BibTeX

    @conference{ 5625539,
    	author = "A. Strano and Hern{\'a}ndez, Carles and Silla, Federico and D. Bertozzi",
    	abstract = "Synchronization interfaces in a network-on-chip (NoC) are becoming vulnerable points that need to be safeguarded against link delay variations and signal misalignments. This paper addresses the challenge of designing a process variation and layout mismatch tolerant link for GALS NoCs by implementing a self-calibration mechanism. A variation detector senses the variability-induced misalignment between data lines with themselves and with the transmitter clock routed with data in source synchronous links. Then, a suitable delayed replica of the transmitter clock is selected for safe sampling of misaligned data. The paper proves correct operation of the GALS link augmented with the variation detector and compares its reliability with that of a detector-less link, beyond proving robustness with respect to the delay variability affecting the detector itself.",
    	booktitle = "System on Chip (SoC), 2010 International Symposium on",
    	doi = "10.1109/ISSOC.2010.5625539",
    	isbn = "978-1-4244-8279-5",
    	keywords = "GALS networks-on-chip;layout mismatch tolerant design;link delay variations;process variation;self-calibration mechanism;signal misalignments;source synchronous links;synchronization interfaces;transmitter clock;delays;integrated circuit layout;network-on",
    	month = "sept.",
    	pages = "43 -48",
    	title = "{P}rocess variation and layout mismatch tolerant design of source synchronous links for {GALS} networks-on-chip",
    	url = "http://dx.doi.org/10.1109/ISSOC.2010.5625539",
    	year = 2010
    }
    
  67. Jose Duato, Antonio José Peña, Federico Silla, Rafael Mayo and Enrique S Quintana-Ortí. rCUDA: Reducing the number of GPU-based accelerators in high performance clusters. In High Performance Computing and Simulation (HPCS), 2010 International Conference on. 2010, 224 - 231. URL BibTeX

    @conference{ 20103913258676,
    	author = "Duato, Jose and Pe{\~n}a, Antonio Jos{\'e} and Silla, Federico and Rafael Mayo and Enrique S. Quintana-Ort",
    	abstract = "The increasing computing requirements for GPUs (Graphics Processing Units) have favoured the design and marketing of commodity devices that nowadays can also be used to accelerate general purpose computing. Therefore, future high performance clusters intended for HPC (High Performance Computing) will likely include such devices. However, high-end GPU-based accelerators used in HPC feature a considerable energy consumption, so that attaching a GPU to every node of a cluster has a strong impact on its overall power consumption. In this paper we detail a framework that enables remote GPU acceleration in HPC clusters, thus allowing a reduction in the number of accelerators installed in the cluster. This leads to energy, acquisition, maintenance, and space savings. ©2010 IEEE.",
    	address = "Caen, France",
    	booktitle = "High Performance Computing and Simulation (HPCS), 2010 International Conference on",
    	journal = "Proceedings of the 2010 International Conference on High Performance Computing and Simulation, HPCS 2010",
    	key = "Energy conservation",
    	keywords = "Energy utilization;Program processors;",
    	note = "Clusters;CUDA;Energy saving;High performance computing;Virtualizations;",
    	pages = "224 - 231",
    	title = "{RCUDA}: {R}educing the number of {GPU}-based accelerators in high performance clusters",
    	url = "http://dx.doi.org/10.1109/HPCS.2010.5547126",
    	year = 2010
    }
    
  68. J M Montañana, M Koibuchi, H Matsutani and H Amano. Stabilizing Path Modification of Power-Aware On/Off Interconnection Networks. In Networking, Architecture and Storage (NAS), 2010 IEEE Fifth International Conference on. July 2010, 218 - 227. URL, DOI BibTeX

    @conference{ 5575649,
    	author = "Monta{\~n}ana, J. M. and M. Koibuchi and H. Matsutani and H. Amano",
    	abstract = "Power saving is required for interconnects of modern PC clusters as well as the performance improvement. To reduce the power consumption of switches with maintaining the performance, on/off link regulations that activate and deactivate the links based on the traffic load have been widely developed in interconnection networks. Depending on which operation is selected, link activation or deactivation, the available network resources are changed, thus requiring paths to be reconfigured. To maintain deadlock freedom of packet transfers, connectivity, and performance during the path changes, we propose to apply dynamic reconfiguration techniques that process packet transfer uninterruptedly to power-aware on/off interconnection networks. The dynamic network reconfiguration techniques stabilize the update of paths that are quite crucial to use power-aware on/off link techniques in interconnects of PC clusters. We investigate the performance and behavior of network reconfiguration technique as soon as the link activation or deactivation occurs. Evaluation results show that the simple dynamic reconfiguration techniques slightly reduce the peak packet latency and reconfiguration time of the change compared with existing static reconfiguration in on/off interconnection networks. A reconfiguration technique called Double Scheme reduces by up to 95% the peak packet latency caused by the on/off link operation.",
    	booktitle = "Networking, Architecture and Storage (NAS), 2010 IEEE Fifth International Conference on",
    	doi = "10.1109/NAS.2010.13",
    	isbn = "978-1-4244-8133-0",
    	keywords = "PC cluster;dynamic reconfiguration technique;on-off link regulations;path modification;power saving;power-aware on-off interconnection networks;power aware computing;power consumption;workstation clusters;",
    	month = "july",
    	pages = "218 -227",
    	title = "{S}tabilizing {P}ath {M}odification of {P}ower-{A}ware {O}n/{O}ff {I}nterconnection {N}etworks",
    	url = "http://dx.doi.org/10.1109/NAS.2010.13",
    	year = 2010
    }
    
  69. C M Juan, G Toffetti, F Abad and José Cano Reyes. Tangible Cubes Used as the User Interface in an Augmented Reality Game for Edutainment. In Advanced Learning Technologies (ICALT), 2010 IEEE 10th International Conference on. July 2010, 599 - 603. URL, DOI BibTeX

    @conference{ 5572569,
    	author = "C.M. Juan and G. Toffetti and F. Abad and Cano Reyes, Jos{\'e}",
    	abstract = "In this paper, we present an Augmented Reality (AR) game for finding and learning about endangered animals in a fun way. It uses tangible cubes as the user interface. This game was included in the activity program of the Summer School of the Universidad Politecnica de Valencia. Forty-six children played the AR game and the equivalent real game. We have compared the results of the two games. The results indicate that children enjoyed playing the AR game more than playing the real game and that they perceived the AR game to be more fun than the real game. They also preferred the AR game to the real one. The children perceived the real game as being easier to play than the AR game. The children also seemed to learn about the subject of endangered animals.",
    	booktitle = "Advanced Learning Technologies (ICALT), 2010 IEEE 10th International Conference on",
    	doi = "10.1109/ICALT.2010.170",
    	keywords = "Universidad Politecnica de Valencia;augmented reality game;edutainment;endangered animals;tangible cubes;user interface;augmented reality;computer aided instruction;computer games;entertainment;user interfaces;",
    	month = "july",
    	pages = "599 -603",
    	title = "{T}angible {C}ubes {U}sed as the {U}ser {I}nterface in an {A}ugmented {R}eality {G}ame for {E}dutainment",
    	url = "http://dx.doi.org/10.1109/ICALT.2010.170",
    	year = 2010
    }
    
  70. Antoni Roca, Jose Flich, Federico Silla and Jose Duato. VCTlite: Towards an Efficient Implementation of Virtual Cut-Through Switching in On-Chip Networks. In 17th Int'l Conference on High Performance Computing (HiPC), in press. December 2010. BibTeX

    @conference{ roca-hipc10,
    	author = "Roca, Antoni and Flich, Jose and Silla, Federico and Duato, Jose",
    	address = "Goa,India",
    	booktitle = "17th Int'l Conference on High Performance Computing (HiPC)",
    	keywords = "on-chip networks; switching;",
    	month = "December",
    	title = "{VCT}lite: {T}owards an {E}fficient {I}mplementation of {V}irtual {C}ut-{T}hrough {S}witching in {O}n-{C}hip {N}etworks",
    	volume = "In Press",
    	year = 2010
    }
    
  71. Francisco Triviño, José L Sánchez, Francisco J Alfaro and Jose Flich. Virtualizing network-on-chip resources in chip-multiprocessors. Microprocessors and Microsystems, In Press (Uncorrected Proof), 2010. URL, DOI BibTeX

    @article{ triviño2010,
    	author = "Francisco Trivi{\~n}o and Jos{\'e} L. S{\'a}nchez and Francisco J. Alfaro and Flich, Jose",
    	abstract = "The number of cores on a single silicon chip is rapidly growing and chips containing tens or even hundreds of identical cores are expected in the future. To take advantage of multicore chips, multiple applications will run simultaneously. As a consequence, the traffic interferences between applications increases and the performance of individual applications can be seriously affected. In this paper, we improve the individual application performance when several applications are simultaneously running. This proposal is based on the virtualization concept and allows us to reduce execution time and network latency in a significant percentage.",
    	doi = "DOI: 10.1016/j.micpro.2010.10.001",
    	issn = "0141-9331",
    	journal = "Microprocessors and Microsystems",
    	keywords = "Network-on-chip",
    	pages = "-",
    	title = "{V}irtualizing network-on-chip resources in chip-multiprocessors",
    	url = "http://www.sciencedirect.com/science/article/B6V0X-518TDT0-1/2/a0626334a6df097c5980c108d5606b62",
    	volume = "In Press, Uncorrected Proof",
    	year = 2010
    }