Publications 2006-2009

Carles Hernández, Federico Silla, Vicente Santonja and Jose Duato. A new mechanism to deal with process variability in NoC links. In IPDPS 2009 - Proceedings of the 2009 IEEE International Parallel and Distributed Processing Symposium. 2009, IEEE Computer Societ. URL BibTeX

@conference{ 20094812508592,
	author = "Hern{\'a}ndez, Carles and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Associated with the ever growing integration scale of VLSI technologies is the increase in process variability, which makes silicon devices to become less predictable. In the context of network-on-chip (NoC), this variability affects the maximum frequency that could be sustained by each wire of the link that interconnects two cores in a CMP system. Reducing the clock frequency so that all wires can properly work is a trivial solution but, as variability increases, this approach causes an unacceptable performance penalty. In this paper, we propose a new technique to deal with the effects of variability on the links of the NoC that interconnects cores in a CMP system. This technique, called Phit Reduction (PR), retrieves most of the bandwidth still available in links containing wires that are not able to operate at the designed operating frequency. More precisely, our mechanism discards these slow wires and uses all the wires that can work at the design frequency. Two implementations are presented: Local Phit Reduction (LPR), oriented to fabrication processes with very high variability, which requires more hardware but provides higher performance; and Global Phit Reduction (GPR), that requires less additional hardware but is not able to extract all the available bandwidth. The performance evaluation presented in the paper confirms that LPR obtains good results both for low and high variability scenarios. Moreover, in most of our experiments LPR practically achieves the same performance than the ideal network. On the other hand, GPR is appropriate for systems where whithin-die variations are expected to be low. © 2009 IEEE.",
	address = "Rome, Italy",
	booktitle = "IPDPS 2009 - Proceedings of the 2009 IEEE International Parallel and Distributed Processing Symposium",
	journal = "IPDPS 2009 - Proceedings of the 2009 IEEE International Parallel and Distributed Processing Symposium",
	key = "Wire",
	keywords = "Bandwidth;Distributed parameter networks;Electric network topology;Machine design;Nanotechnology;Radar antennas;",
	note = "Available bandwidth;Clock frequency;Design frequencies;Fabrication process;High variability;Ideal network;In-process;Maximum frequency;Network on chip;New mechanisms;Operating frequency;Performance evaluation;Performance penalties;Process Variability;Silicon devices;Trivial solutions;VLSI technology;",
	pages = "IEEE Computer Societ",
	title = "{A} new mechanism to deal with process variability in {N}o{C} links",
	url = "http://dx.doi.org/10.1109/IPDPS.2009.5161048",
	year = 2009
}

Francisco J Alfaro, Jose L Sanchez and Jose Duato. A new strategy to manage the InfiniBand arbitration tables. Journal of Parallel and Distributed Computing 69(6):508 - 520, 2009. URL BibTeX

@article{ 20091912072389,
	author = "Francisco J. Alfaro and Jose L. Sanchez and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) is an industry-standard architecture for server I/O and interprocessor communication. InfiniBand is extensively used for interconnection in high-performance clusters. It has been developed by the InfiniBandS M Trade Association (IBTA) to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) necessary for present and future server systems. The provision of QoS in data communication networks is currently the focus of much discussion and research in both industry and academia. In that sense, IBA enables QoS support with some mechanisms. In this paper, we examine these mechanisms and describe a way to use them. We propose a traffic segregation strategy based only on delay requirements. Moreover, we propose a very effective methodology to compute the virtual lane arbitration tables. Finally, we evaluate our proposal and performance results show that, with a correct traffic treatment at the output ports, every traffic class meets its QoS requirements. © 2009 Elsevier Inc. All rights reserved.",
	address = "6277 Sea Harbor Drive, Orlando, FL 32887-4900, United States",
	issn = 07437315,
	journal = "Journal of Parallel and Distributed Computing",
	key = "Quality of service",
	keywords = "Interconnection networks;Parallel processing systems;Queueing networks;Telecommunication networks;",
	note = "Arbitration;Clusters;Connection requirements;InfiniBand;QoS;",
	number = 6,
	pages = "508 - 520",
	title = "{A} new strategy to manage the {I}nfini{B}and arbitration tables",
	url = "http://dx.doi.org/10.1016/j.jpdc.2009.02.002",
	volume = 69,
	year = 2009
}

, Jose Flich, Jose Duato, H Eberle, N Gura and W Olesinski. A performance evaluation of 2D-mesh, ring, and crossbar interconnects for chip multi-processors. In Network on Chip Architectures, 2009. NoCArc 2009. 2nd International Workshop on. 2009, 51 -56. BibTeX

@conference{ 5375715,
	author = ", and Flich, Jose and Duato, Jose and H. Eberle and N. Gura and W. Olesinski",
	abstract = "As the number of processing nodes on chip multi-processors (CMPs) keeps increasing, providing efficient communication with the on-chip interconnect becomes increasingly critical. With 32-core CMP designs on the drawing table of engineers, there is a demand for accurate simulation models that capture all the complexities and interactions of the different design layers including the application, operating system, cache hierarchy, coherency protocol, and other on-chip resources. These components cannot be modeled anymore in isolation as unpredicted performance anomalies may arise once all the system variables are taken into account. In this paper, we present a simulation framework for CMP systems, focusing our attention on the on-chip network. We show preliminary results for the choice of key network parameters (topology, flit size) with respect to the behavior and performance of applications running on top of different network configurations. This paper tries to convey the need for an overall CMP system simulator as a way to accurately characterize the actual behavior of the on-chip network.",
	booktitle = "Network on Chip Architectures, 2009. NoCArc 2009. 2nd International Workshop on",
	keywords = "2D-mesh interconnects;32-core CMP designs;cache hierarchy;chip multi-processors;coherency protocol;crossbar interconnects;on-chip network;operating system;processing nodes;ring interconnects;integrated circuit design;integrated circuit interconnections;mi",
	month = "12-12",
	pages = "51 -56",
	title = "{A} performance evaluation of 2{D}-mesh, ring, and crossbar interconnects for chip multi-processors",
	year = 2009
}

Salvador Petit, Rafael Ubal, Julio Sahuquillo and Pedro Lopez. A power-aware hybrid RAM-CAM renaming mechanism for fast recovery. In Computer Design, 2009. ICCD 2009. IEEE International Conference on. 2009, 150 -157. URL, DOI BibTeX

@conference{ 5413160,
	author = "Petit, Salvador and Ubal, Rafael and Sahuquillo, Julio and Lopez, Pedro",
	abstract = "Modern superscalar processors implement register renaming by using either RAM or CAM tables. The design of these structures should address their access time and misprediction recovery penalty. While direct-mapped RAMs provide faster access times, CAMs are more appropriate to avoid recovery penalties. Although they are more complex and slower, CAMs usually match the processor cycle in current designs. However, they do not scale with the number of physical registers and the pipeline width. In this paper we present a new hybrid RAM-CAM register renaming scheme, which combines the best of both approaches. In a steady state, a RAM provides the current mappings quickly; on mispeculation, a low-complexity CAM enables immediate recovery and further register renaming. Compared to an ideal CAM in a 4-way state-of-the-art superscalar microprocessor, and for almost the same performance (1% slowdown) and area (95% of the ideal CAM size), the proposed scheme consumes about 90% less dynamic energy.",
	booktitle = "Computer Design, 2009. ICCD 2009. IEEE International Conference on",
	doi = "10.1109/ICCD.2009.5413160",
	issn = "1063-6404",
	keywords = "direct-mapped RAM;misprediction recovery penalty;physical registers;pipeline width;power-aware hybrid RAM-CAM renaming mechanism;processor cycle;register renaming;superscalar processors;microprocessor chips;power aware computing;random-access storage;",
	month = "oct.",
	pages = "150 -157",
	title = "{A} power-aware hybrid {RAM}-{CAM} renaming mechanism for fast recovery",
	url = "http://dx.doi.org/10.1109/ICCD.2009.5413160",
	year = 2009
}

A Martinez, P J Garcia, F J Alfaro, J L Sanchez, Jose Flich, F J Quiles and Jose Duato. A Switch Architecture Guaranteeing QoS Provision and HOL Blocking Elimination. Parallel and Distributed Systems, IEEE Transactions on 20(1):13 -24, 2009. DOI BibTeX

@article{ 4497190,
	author = "A. Martinez and P.J. Garcia and F.J. Alfaro and J.L. Sanchez and Flich, Jose and F.J. Quiles and Duato, Jose",
	abstract = "Both QoS support and congestion management techniques become essential to achieve good network performance in current high-speed interconnection networks. The most effective techniques traditionally considered for both issues, however, require too many resources for being implemented. In this paper we propose a new cost-effective switch architecture able to face the challenges of congestion management and, at the same time, to provide QoS. The efficiency of our proposal is based on using the resources (queues) used by RECN (an efficient Head-Of-Line blocking elimination technique) also for QoS support, without increasing queue requirements. Provided results show that the new switch architecture is able to guarantee QoS levels without any degradation due to congestion situations.",
	doi = "10.1109/TPDS.2008.62",
	issn = "1045-9219",
	journal = "Parallel and Distributed Systems, IEEE Transactions on",
	keywords = "HOL blocking elimination;QoS provision;congestion management;high-speed interconnection networks;network performance;switch architecture;quality of service;telecommunication congestion control;telecommunication network management;telecommunication switchi",
	month = "jan.",
	number = 1,
	pages = "13 -24",
	title = "{A} {S}witch {A}rchitecture {G}uaranteeing {Q}o{S} {P}rovision and {HOL} {B}locking {E}limination",
	volume = 20,
	year = 2009
}

Salvador Petit, Rafael Ubal, Julio Sahuquillo, Pedro Lopez and Jose Duato. An Efficient Low-Complexity Alternative to the ROB for Out-of-Order Retirement of Instructions. In Antonio Nunez; Pedro P Carballo (ed.). Digital System Design, Architectures, Methods and Tools, 2009. DSD '09. 12th Euromicro Conference on. 2009, 635 -642. URL, DOI BibTeX

@conference{ 5350186,
	author = "Petit, Salvador and Ubal, Rafael and Sahuquillo, Julio and Lopez, Pedro and Duato, Jose",
	abstract = "Current superscalar processors use a reorder buffer (ROB) to support speculation, precise exceptions, and register reclamation. Instructions are retired from this structure in program order, which may lead to significant performance degradation if a long latency operation blocks the ROB head. In this paper, a checkpoint-free out-of-order commit architecture is proposed, which replaces the ROB with a small structure called validation buffer (VB) from which instructions are retired as soon as their speculative state is resolved. An aggressive register reclamation mechanism targeted to this microarchitecture is also devised. Experimental results show that the VB microarchitecture is much more efficient than a ROB-based microprocessor. For example, a 32-entry VB provides similar performance to a 256-entry ROB, while reducing the utilization of other major processor structures.",
	booktitle = "Digital System Design, Architectures, Methods and Tools, 2009. DSD '09. 12th Euromicro Conference on",
	doi = "10.1109/DSD.2009.237",
	editor = "Antonio Nunez; Pedro P. Carballo",
	isbn = "978-0-7695-3782-5",
	keywords = "ROB-based microprocessor;checkpoint-free out-of-order commit architecture;out-of-order instruction retirement;register reclamation;register reclamation mechanism;superscalar reorder buffer processors;validation buffer;buffer circuits;microprocessor chips;",
	month = "aug.",
	pages = "635 -642",
	title = "{A}n {E}fficient {L}ow-{C}omplexity {A}lternative to the {ROB} for {O}ut-of-{O}rder {R}etirement of {I}nstructions",
	url = "http://dx.doi.org/10.1109/DSD.2009.237",
	year = 2009
}

D Ludovici, Francisco Gilabert, S Medardoni, Crispín Gomez, Maria E Gomez, Pedro Lopez, G N Gaydadjiev and D Bertozzi. Assessing fat-tree topologies for regular network-on-chip design under nanoscale technology constraints. In 2009 Design, Automation & Test in Europe Conference & Exhibition (DATE'09). 2009, 4 pp. -. BibTeX

@conference{ 10730481,
	author = "D. Ludovici and Gilabert, Francisco and S. Medardoni and Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and G.N. Gaydadjiev and D. Bertozzi",
	abstract = "Most of past evaluations of fat-trees for on-chip interconnection networks rely on oversimplifying or even irrealistic architecture and traffic pattern assumptions, and very few layout analyses are available to relieve practical feasibility concerns in nanoscale technologies. This work aims at providing an in-depth assessment of physical synthesis efficiency of fat-trees and at extrapolating silicon-aware performance figures to back-annotate in the system-level performance analysis. A 2D mesh is used as a reference architecture for comparison, and a 65 nm technology is targeted by our study. Finally, in an attempt to mitigate the implementation cost of k-ary n-tree topologies, we also review an alternative unidirectional multi-stage interconnection network which is able to simplify the fat-tree architecture and to minimally impact performance.",
	address = "Piscataway, NJ, USA",
	booktitle = "2009 Design, Automation {\&} Test in Europe Conference {\&} Exhibition (DATE'09)",
	journal = "2009 Design, Automation {{\&}}amp; Test in Europe Conference {{\&}}amp; Exhibition (DATE'09)",
	keywords = "extrapolation;integrated circuit interconnections;integrated circuit layout;nanoelectronics;network topology;network-on-chip;",
	note = "fat-tree topology;network-on-chip design;nanoscale technology;on-chip interconnection network;traffic pattern;layout analysis;extrapolation;system-level performance analysis;",
	pages = "4 pp. -",
	title = "{A}ssessing fat-tree topologies for regular network-on-chip design under nanoscale technology constraints",
	year = 2009
}

J M Montañana, M Koibuchi, H Matsutani and H Amano. Balanced Dimension-Order Routing for k-ary n-cubes. In Parallel Processing Workshops, 2009. ICPPW '09. International Conference on. 2009, 499 -506. URL, DOI BibTeX

@conference{ 5365405,
	author = "Monta{\~n}ana, J. M. and M. Koibuchi and H. Matsutani and H. Amano",
	abstract = "Current Network-on-Chip (NoC) architectures sometimes employ mesh or torus topology with the dimension-order routing. In this paper, we propose a deadlock-free routing algorithm, referred to as Balanced Dimension-Order Routing (BDOR), which provides the balanced minimal paths to each destination based on the simple routing regulations. Since the BDOR has the similar path regularity to that of the dimension-order routing, its implementation can be lightweight, and most of its modules can be borrowed from the router for the dimension-order routing. Evaluation results show that the BDOR router increases by 3.4% hardware amount compared with the router for the dimension-order routing. Also show that the throughput of the BDOR outperforms on average up to 14% that of the dimension-order routing on two-dimensional mesh and torus.",
	booktitle = "Parallel Processing Workshops, 2009. ICPPW '09. International Conference on",
	doi = "10.1109/ICPPW.2009.64",
	isbn = "978-1-4244-4923-1",
	issn = "1530-2016",
	keywords = "balanced dimension-order routing;deadlock-free routing algorithm;k-ary n-cubes;mesh topology;network-on-chip architecture;path regularity;routing regulation;torus topology;concurrency control;hypercube networks;network routing;network topology;network-on-",
	month = "sept.",
	pages = "499 -506",
	title = "{B}alanced {D}imension-{O}rder {R}outing for k-ary n-cubes",
	url = "http://dx.doi.org/10.1109/ICPPW.2009.64",
	year = 2009
}

Vicente Chirivella, Rosa Alcover, Jose Flich and Jose Duato. Dependability analysis of a fault-tolerant network reconfiguring strategy. In Henk Sips; Dick Epema; Hai-Xiang Lin (ed.). Euro-Par 2009 Parallel Processing 5704. August 2009, 1040 - 1051. URL, DOI BibTeX

@conference{ 20094612441323,
	author = "Chirivella, Vicente and Alcover, Rosa and Flich, Jose and Duato, Jose",
	abstract = "Fault tolerance mechanisms become indispensable as the number of processors increases in large systems. Measuring the effectiveness of such mechanisms before its implementation becomes mandatory. Research toward understanding the effects of different network parameters on the dependability parameters, like mean time to network failure or availability, becomes necessary. In this paper we analyse in detail such effects with a methodology proposed previously by us. This methodology is based on Markov chains and Analysis of Variance techniques. As a case study we analyse the effects of network size, mean time to node failure, mean time to node repair, mean time to network repair and coverage of the failure when using a 2D mesh network with a fault-tolerant mechanism (similar to the one used in the BlueGene/L system), that is able to remove rows and/or columns in the presence of failures. © 2009 Springer.",
	address = "Delft, Netherlands",
	booktitle = "Euro-Par 2009 Parallel Processing",
	doi = "10.1007/978-3-642-03869-3_96",
	editor = "Henk Sips; Dick Epema; Hai-Xiang Lin",
	isbn = "978-3-642-03869-3",
	issn = "0302-9743",
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Fault tolerant computer systems",
	keywords = "Artificial intelligence;Bioinformatics;Fault tolerance;Markov processes;Quality assurance;Regression analysis;",
	month = "Aug",
	note = "BlueGene/L systems;Dependability analysis;Fault tolerance mechanisms;Fault-tolerant mechanism;Fault-tolerant networks;Large system;Markov Chain;Mesh network;Network failure;Network parameters;Network size;Node failure;",
	pages = "1040 - 1051",
	publisher = "Springer",
	series = "Lecture Notes in Computer Science",
	title = "{D}ependability analysis of a fault-tolerant network reconfiguring strategy",
	url = "http://dx.doi.org/10.1007/978-3-642-03869-3_96",
	volume = 5704,
	year = 2009
}

Alberto Ros, M Cintra, M E Acacio and J M Garcia. Distance-aware round-robin mapping for large NUCA caches. In High Performance Computing (HiPC), 2009 International Conference on. 2009, 79 -88. URL, DOI BibTeX

@conference{ 5433220,
author = "Ros, Alberto and M. Cintra and M.E. Acacio and J.M. Garcia",
abstract = "In many-core architectures, memory blocks are commonly assigned to the banks of a NUCA cache by following a physical mapping. This mapping assigns blocks to cache banks in a round-robin fashion, thus neglecting the distance between the cores that most frequently access every block and the corresponding NUCA bank for the block. This issue impacts both cache access latency and the amount of on-chip network traffic generated. On the other hand, first-touch mapping policies, which take into account distance, can lead to an unbalanced utilization of cache banks, and consequently, to an increased number of expensive off-chip accesses. In this work, we propose the distance-aware round-robin mapping policy, an OS-managed policy which addresses the trade-off between cache access latency and number of off-chip accesses. Our policy tries to map the pages accessed by a core to its closest (local) bank, like in a first-touch policy. However, our policy also introduces an upper bound on the deviation of the distribution of memory pages among cache banks, which lessens the number of off-chip accesses. This tradeoff is addressed without requiring any extra hardware structure. We also show that the private cache indexing commonly used in many-core architectures is not the most appropriate for OS-managed distance-aware mapping policies, and propose to employ different bits for such indexing. Using GEMS simulator we show that our proposal obtains average improvements of 11% for parallel applications and 14% for multi-programmed workloads in terms of execution time, and significant reductions in network traffic, over a traditional physical mapping. Moreover, when compared to a first-touch mapping policy, our proposal improves average execution time by 5% for parallel applications and 6% for multi-programmed workloads, slightly increasing on-chip network traffic.",
booktitle = "High Performance Computing (HiPC), 2009 International Conference on",
doi = "10.1109/HIPC.2009.5433220",
keywords = "GEMS simulator;OS-managed policy;cache access latency;cache banks;distance-aware round-robin mapping;first-touch mapping policy;many-core architectures;memory blocks;multiprogrammed workloads;nonuniform cache architecture;on-chip network traffic;parallel",
month = "dec.",
pages = "79 -88",
title = "{D}istance-aware round-robin mapping for large {NUCA} caches",
url = "http://dx.doi.org/10.1109/HIPC.2009.5433220",
year = 2009
}

Salvador Coll, Francisco J Mora, Jose Duato and Fabrizio Petrini. Efficient and scalable hardware-based multicast in fat-tree networks. IEEE Transactions on Parallel and Distributed Systems 20(9):1285 - 1298, 2009. URL, DOI BibTeX

@article{ 20093412267181,
	author = "Coll, Salvador and Francisco J. Mora and Duato, Jose and Fabrizio Petrini",
	abstract = "This article presents an efficient and scalable mechanism to overcome the limitations of collective communication in switched interconnection networks in the presence of faults. Considering that current trends in supercomputing are moving toward massively parallel computers, with many thousands of components, reliability becomes a challenge. In such scenario, fat-tree networks that provide hardware support for collective communication suffer from serious performance degradation due to the presence of, even, a single faulty node. This paper describes a new mechanism to provide high-performance collective communication in such situations. The feasibility of the proposed technique is formally demonstrated. We present the design of a new hardware-based routing algorithm for multicast, that is at the base of our proposal. The proposed mechanism is implemented and experimentally evaluated. Our experimental results show that hardware-based multicast trees provide an efficient and scalable solution for collective communication in fat-tree networks, significantly outperforming traditional solutions. © 2009 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	doi = "10.1109/TPDS.2008.228",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Communication",
	keywords = "Computer hardware;Convolutional codes;Multicasting;Routing algorithms;Switching circuits;",
	note = "Data communications;Interprocessor communications;Multicast;Network communication;Network problems;Trees;",
	number = 9,
	pages = "1285 - 1298",
	title = "{E}fficient and scalable hardware-based multicast in fat-tree networks",
	url = "http://dx.doi.org/10.1109/TPDS.2008.228",
	volume = 20,
	year = 2009
}

Samuel Rodrigo, S Medardoni, Jose Flich, D Bertozzi and Jose Duato. Efficient implementation of distributed routing algorithms for NoCs. Computers Digital Techniques, IET 3(5):460 -475, September 2009. DOI BibTeX

@article{ 5200571,
	author = "Rodrigo, Samuel and S. Medardoni and Flich, Jose and D. Bertozzi and Duato, Jose",
	abstract = "Chip multiprocessors (CMPs) are gaining momentum in the high-performance computing domain. Networks-on-chip (NoCs) are key components of CMP architectures, in that they have to deal with the communication scalability challenge while meeting tight power, area and latency constraints. 2D mesh topologies are usually preferred by designers of general purpose NoCs. However, manufacturing faults may break their regularity. Moreover, resource management frameworks may require the segmentation of the network into irregular regions. Under these conditions, efficient routing becomes a challenge. Although the use of routing tables at switches is flexible, it does not scale in terms of latency and area due to its memory requirements. Logic-based distributed routing (LBDR) is proposed as a new routing method that removes the need for routing tables at all. LBDR enables the implementation of many routing algorithms on most of the practical topologies we may find in the near future in a multi-core system. From an initial topology and routing algorithm, a set of three bits per switch/output port is computed. Evaluation results show that, by using a small logic, LBDR mimics the performance of routing algorithms when implemented with routing tables, both in regular and irregular topologies. LBDR implementation in a real NoC switch is also explored, proving its smooth integration in the architecture and its negligible hardware and performance overhead.",
	doi = "10.1049/iet-cdt.2008.0092",
	issn = "1751-8601",
	journal = "Computers Digital Techniques, IET",
	keywords = "2D mesh topologies;chip multiprocessors;communication scalability;distributed routing algorithms;high-performance computing domain;logic-based distributed routing;manufacturing faults;multicore system;network segmentation;networks-on-chip;resource managem",
	month = "september",
	number = 5,
	pages = "460 -475",
	title = "{E}fficient implementation of distributed routing algorithms for {N}o{C}s",
	volume = 3,
	year = 2009
}

T Skeie, F O Sem-Jacobsen, Samuel Rodrigo, Jose Flich, D Bertozzi and S Medardoni. Flexible DOR routing for virtualization of multicore chips. In System-on-Chip, 2009. SOC 2009. International Symposium on. 2009, 073 -076. DOI BibTeX

@conference{ 5335673,
	author = "T. Skeie and F.O. Sem-Jacobsen and Rodrigo, Samuel and Flich, Jose and D. Bertozzi and S. Medardoni",
	abstract = "The expected increase in number of cores on a single chip leads to the necessity of high-performance on chip interconnects (NoC). Furthermore, in order to fully utilize the abundance of cores, the chip is expected to support a number of applications running on the chip simultaneously. It is therefore necessary to partition the chip to support numerous applications without any risk of interference between them. The success of this depends on the flexibility of the underlying routing algorithm. This paper presents a flexible routing algorithm based on dimension ordered routing, which supports a large variety of irregular (2-D and 3-D) mesh topologies. The algorithm provides high efficiency at very low additional complexity, as is confirmed by experimental results.",
	booktitle = "System-on-Chip, 2009. SOC 2009. International Symposium on",
	doi = "10.1109/SOCC.2009.5335673",
	keywords = "dimension order routing;mesh topologies;multicore chips virtualization;on chip interconnects;routing algorithm;integrated circuit interconnections;network-on-chip;",
	month = "5-7",
	pages = "073 -076",
	title = "{F}lexible {DOR} routing for virtualization of multicore chips",
	year = 2009
}

R Holsmark, S Kumar, M Palesi and . HiRA: A methodology for deadlock free routing in hierarchical networks on chip. In Networks-on-Chip, 2009. NoCS 2009. 3rd ACM/IEEE International Symposium on. May 2009, 2 -11. URL, DOI BibTeX

@conference{ 5071439,
	author = "R. Holsmark and S. Kumar and M. Palesi and ,",
	abstract = "Complexity of designing large and complex NoCs can be reduced/managed by using the concept of hierarchical networks. In this paper, we propose a methodology for design of deadlock free routing algorithms for hierarchical networks, by combining routing algorithms of component subnets. Specifically, our methodology ensures reachability and deadlock freedom for the complete network if routing algorithms for subnets are deadlock free. We evaluate and compare the performance of hierarchical routing algorithms designed using our methodology with routing algorithms for corresponding flat networks. We show that hierarchical routing, combining best routing algorithm for each subnet, has a potential for providing better performance than using any single routing algorithm. This is observed for both synthetic as well as traffic from real applications. We also demonstrate, by measuring jitter in throughput, that hierarchical routing algorithms leads to smoother flow of network traffic. A router architecture that supports scalable table-based routing is briefly outlined.",
	booktitle = "Networks-on-Chip, 2009. NoCS 2009. 3rd ACM/IEEE International Symposium on",
	doi = "10.1109/NOCS.2009.5071439",
	keywords = "deadlock free routing;hierarchical networks;hierarchical routing algorithms;jitter;network traffic;network-on-chip;concurrency control;hierarchical systems;interconnected systems;jitter;network routing;network-on-chip;operating systems;",
	month = "may",
	pages = "2 -11",
	title = "{H}i{RA}: {A} methodology for deadlock free routing in hierarchical networks on chip",
	url = "http://dx.doi.org/10.1109/NOCS.2009.5071439",
	year = 2009
}

P Morillo, J M Orduna and Jose Duato. M-GRASP: a GRASP with memory for latency-aware partitioning methods in DVE systems. IEEE Transactions on Systems, Man and Cybernetics, Part A (Systems and Humans) 39(6):1214 - 23, 2009. URL BibTeX

@article{ 10919102,
	author = "P. Morillo and J.M. Orduna and Duato, Jose",
	abstract = "A necessary condition for providing quality of service to distributed virtual environments (DVEs) is to provide a system response below a maximum threshold to the client computers. In this sense, latency-aware partitioning methods try to provide response times below the threshold to the maximum number of client computers as possible. These partitioning methods should find an assignment of clients to servers that optimizes system throughput, system latency, and partitioning efficiency. In this paper, we present a new algorithm based on greedy randomized adaptive search procedure with memory for finding the best solutions as possible to this problem. We take into account several different alternatives in order to design both the constructive phase and the local search phase of this multistart metaheuristic for combinatorial problems. Additionally, we enhance this basic approach with some intensification strategies that improve the efficiency of the basic search method. Performance evaluation results show that the new algorithm increases the performance provided by other metaheuristics when applied to solve the latency-aware partitioning problem in DVE systems.",
	address = "USA",
	issn = "1083-4427",
	journal = "IEEE Transactions on Systems, Man and Cybernetics, Part A (Systems and Humans)",
	keywords = "client-server systems;combinatorial mathematics;greedy algorithms;quality of service;randomised algorithms;search problems;virtual reality;",
	note = "M-GRASP;distributed virtual environments;quality of service;latency-aware partitioning methods;system latency;greedy randomized adaptive search procedure;local search phase;combinatorial problems;DVE system;",
	number = 6,
	pages = "1214 - 23",
	title = "{M}-{GRASP}: a {GRASP} with memory for latency-aware partitioning methods in {DVE} systems",
	url = "http://dx.doi.org/10.1109/TSMCA.2009.2025024",
	volume = 39,
	year = 2009
}

, M Palesi, Jose Flich, S Kumar, Pedro Lopez, R Holsmark and Jose Duato. Region-Based Routing: A Mechanism to Support Efficient Routing Algorithms in NoCs. Very Large Scale Integration (VLSI) Systems, IEEE Transactions on 17(3):356 -369, March 2009. URL, DOI BibTeX

@article{ 4804124,
	author = ", and M. Palesi and Flich, Jose and S. Kumar and Lopez, Pedro and R. Holsmark and Duato, Jose",
	abstract = "An efficient routing algorithm is important for large on-chip networks [network-on-chip (NoC)] to provide the required communication performance to applications. Implementing NoC using table-based switches provide many advantages, including possibility of changing routing algorithms and fault tolerance, due to the option of table reconfigurations. However, table-based switches have been considered unsuitable for NoCs due to their perceived high area and power consumption. In this paper, we describe the region-based routing (RBR) mechanism which groups destinations into network regions allowing an efficient implementation with logic blocks. RBR can also be viewed as a mechanism to reduce the number of entries in routing tables. RBR is general and can be used in conjunction with any adaptive routing algorithm. In particular, we have evaluated the proposed scheme in conjunction with a general routing algorithm, namely segment-based routing (SR) and an application specific routing algorithm (APSRA) using regular and irregular mesh topologies. Our study shows that the number of entries in the table is significantly reduced, especially for large networks. Evaluation results show that RBR requires only four regions to support several routing algorithms in a 2-D mesh with no performance degradation. Considering link failures, our results indicate that RBR combined with SR is able to tolerate up to 7 link failures in an 8times8 mesh. RBR also reduces area and power dissipation of an equivalent table-based implementation by factors of 8 and 10, respectively. Moreover, the degradation in performance of the network is insignificant when using APSRA combined with RBR.",
	doi = "10.1109/TVLSI.2008.2012010",
	issn = "1063-8210",
	journal = "Very Large Scale Integration (VLSI) Systems, IEEE Transactions on",
	keywords = "adaptive routing algorithm;application specific routing algorithm;fault tolerance;large on-chip networks;network-on-chip;region-based routing mechanism;segment-based routing;table-based switches;network topology;network-on-chip;",
	month = "march",
	number = 3,
	pages = "356 -369",
	title = "{R}egion-{B}ased {R}outing: {A} {M}echanism to {S}upport {E}fficient {R}outing {A}lgorithms in {N}o{C}s",
	url = "http://dx.doi.org/10.1109/TVLSI.2008.2012010",
	volume = 17,
	year = 2009
}

Samuel Rodrigo, Carles Hernández, Jose Flich, Federico Silla, Jose Duato, S Medardoni, D Bertozzi, D Dai and . Yield-oriented evaluation methodology of network-on-chip routing implementations. In System-on-Chip, 2009. SOC 2009. International Symposium on. 2009, 100 -105. URL, DOI BibTeX

@conference{ 5335667,
	author = "Rodrigo, Samuel and Hern{\'a}ndez, Carles and Flich, Jose and Silla, Federico and Duato, Jose and S. Medardoni and D. Bertozzi and D. Dai and ,",
	abstract = "Network-on-Chip technology is gaining wide popularity for the interconnection of an increasing number of processor cores on the same silicon die. However, growing process variations cause interconnect malfunction or prevent the network from working at the intended frequency, directly impacting yield and manufacturing cost. Topology agnostic routing algorithms have the potential to tolerate process variations without degrading performance. We propose a three step methodology for evaluating routing algorithms in their ability to deal with variability. Using yield enhancement and operation speed preservation as the criteria, we demonstrate how this methodology can be used to select the best design choice among several plausible combinations of routing algorithms and implementations. Also, we show how an efficient table-less routing implementation can be used to minimise the impact of variability on manufacturing and operating frequency.",
	booktitle = "System-on-Chip, 2009. SOC 2009. International Symposium on",
	doi = "10.1109/SOCC.2009.5335667",
	keywords = "Si;interconnect malfunction;network-on-chip routing;processor core interconnection;silicon die;yield enhancement;yield operation;yield oriented evaluation;integrated circuit interconnections;integrated circuit yield;microprocessor chips;network-on-chip;si",
	month = "oct.",
	pages = "100 -105",
	title = "{Y}ield-oriented evaluation methodology of network-on-chip routing implementations",
	url = "http://dx.doi.org/10.1109/SOCC.2009.5335667",
	year = 2009
}

Rafael Tornero, Juan M Orduna, Maurizio Palesi and Jose Duato. A communication-aware topological mapping technique for NoCs. 2008, 910 - 919. URL BibTeX

@conference{ 20083911589416,
	author = "Rafael Tornero and Juan M. Orduna and Maurizio Palesi and Duato, Jose",
	abstract = "Networks-on-Chip (NoCs) have been proposed as a promising solution to the complex on-chip communication problems derived from the increasing number of processor cores. The design of NoCs involves several key issues, being the topological mapping (the mapping of the Intellectual Properties (IPs) to network nodes) one of them. Several proposals have been focused on topological mapping last years, but they require the experimental validation of each mapping considered. In this paper, we propose a communication-aware topological mapping technique for NoCs. This technique is based on the experimental correlation of the network model with the actual network performance, thus avoiding the need to experimentally evaluate each mapping explored. The evaluation results show that the proposed technique can provide better performance than the currently existing techniques (in terms of both network latency and energy consumption). Additionally, it can be used for both regular and irregular topologies. © 2008 Springer-Verlag Berlin Heidelberg.",
	address = "Las Palmas de Gran Canaria, Spain",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Conformal mapping",
	keywords = "Biological materials;Chlorine compounds;Communication;Energy policy;Microprocessor chips;Systems engineering;Telecommunication;Topology;",
	note = "Energy consumption;Evaluation results;Experimental validations;Network latencies;Network modelling;Network nodes;Network performances;Networks-on-chip;On-chip communications;Parallel processing;Processor cores;Topological mapping;",
	pages = "910 - 919",
	title = "{A} communication-aware topological mapping technique for {N}o{C}s",
	url = "http://dx.doi.org/10.1007/978-3-540-85451-7_98",
	volume = "5168 LNCS",
	year = 2008
}

Ricardo Fernandez-Pascual, Jose M Garcia, Manuel E Acacio and Jose Duato. A fault-tolerant directory-based cache coherence protocol for CMP architectures. 2008, 267 - 276. URL BibTeX

@conference{ 20084211640662,
	author = "Ricardo Fernandez-Pascual and Jose M. Garcia and Manuel E. Acacio and Duato, Jose",
	abstract = "Current technology trends of increased scale of integration are pushing CMOS technology into the deepsubmicron domain, enabling the creation of chips with a significantly greater number of transistors but also more prone to transient failures. Hence, computer architects will have to consider reliability as a prime concern for future chip-multiprocessor designs (CMPs). Since the interconnection network of future CMPs will use a significant portion of the chip real state, it will be especially affected by transient failures. We propose to deal with this kind of failures at the level of the cache coherence protocol instead of ensuring the reliability of the network itself. Particularly, we have extended a directory-based cache coherence protocol to ensure correct program semantics even in presence of transient failures in the interconnection network. Additionally, we show that our proposal has virtually no impact on execution time with respect to a non fault-tolerant protocol, and just entails modest hardware and network traffic overhead. © 2008 IEEE.",
	address = "Anchorage, AK, United states",
	journal = "Proceedings of the International Conference on Dependable Systems and Networks",
	key = "Computer networks",
	keywords = "CMOS integrated circuits;Coherent light;Information theory;Interconnection networks;Internet;Nanotechnology;Network architecture;Network protocols;Reliability;Sensor networks;",
	note = "Cache coherence protocols;CMOS technologies;CMP architectures;Computer architects;Current technologies;Dependable systems;Execution time;Fault-tolerant;Fault-tolerant protocols;International conferences;Multiprocessor designs;Network traffics;Program semantics;Real state;Scale of integration;",
	pages = "267 - 276",
	title = "{A} fault-tolerant directory-based cache coherence protocol for {CMP} architectures",
	url = "http://dx.doi.org/10.1109/DSN.2008.4630095",
	year = 2008
}

Antonio Robles, Aurelio Bermudez, Rafael Casado, Francisco J Quiles, Tor Skeie and Jose Duato. A proposal for managing ASI fabrics. Journal of Systems Architecture 54(7):664 - 678, 2008. URL BibTeX

@article{ 20083011398981,
	author = "Robles, Antonio and Aurelio Bermudez and Rafael Casado and Francisco J. Quiles and Tor Skeie and Duato, Jose",
	abstract = "Recent years, computer performance has been significantly increased. As a consequence, data I/O systems have become bottlenecks within systems. To alleviate this problem, Advanced Switching was recently proposed as a new standard for future interconnects. The Advanced Switching specification establishes a fabric management infrastructure, which is in charge of updating the set of fabric paths each time a topological change takes place. The use of source routing and passive switches makes unfeasible the adaptation to this new technology of many existing proposals to handle topological changes in switched interconnection networks. This paper presents a fabric management mechanism for Advanced Switching, but also suitable for other source routing interconnects. Furthermore, the work presents a detailed performance evaluation for this proposal. This evaluation allows us to identify the main drawbacks of the mechanism and to define future improvements. © 2007 Elsevier B.V. All rights reserved.",
	address = "P.O. Box 211, Amsterdam, 1000 AE, Netherlands",
	issn = 13837621,
	journal = "Journal of Systems Architecture",
	key = "Fabrics",
	keywords = "Mechanisms;Standards;Switching circuits;Topology;",
	note = "Advanced switching;Computer performance;Elsevier (CO);I/O systems;Management Infrastructure;New technologies;Passive switches;Performance evaluation (PE);Source routing;Topological changes;",
	number = 7,
	pages = "664 - 678",
	title = "{A} proposal for managing {ASI} fabrics",
	url = "http://dx.doi.org/10.1016/j.sysarc.2007.12.002",
	volume = 54,
	year = 2008
}

C Juan, F Beatrice and José Cano Reyes. An Augmented Reality System for Learning the Interior of the Human Body. In Advanced Learning Technologies, 2008. ICALT '08. Eighth IEEE International Conference on. July 2008, 186 -188. URL, DOI BibTeX

@conference{ 4561662,
	author = "C. Juan and F. Beatrice and Cano Reyes, Jos{\'e}",
	abstract = "Augmented Reality has been used for developing systems with learning purposes. In this paper, we present an Augmented Reality system for learning the interior of the human body. We have tested the system with children of the Summer School of the Technical University of Valencia. In this test we have analysed if the use of a Head-Mounted Display or a typical monitor influence in the experience of the children. Results do not offer statistical significant differences using both visualization systems and confirm that children enjoyed learning with the system and consider it as useful tool not only for learning the interior of the human body but also for learning other subjects.",
	booktitle = "Advanced Learning Technologies, 2008. ICALT '08. Eighth IEEE International Conference on",
	doi = "10.1109/ICALT.2008.121",
	keywords = "augmented reality system;children learning tool;computer monitor;head-mounted display;human body interior learning system;visualization system;augmented reality;biomedical education;computer aided instruction;computer displays;data visualisation;helmet mo",
	month = "july",
	pages = "186 -188",
	title = "{A}n {A}ugmented {R}eality {S}ystem for {L}earning the {I}nterior of the {H}uman {B}ody",
	url = "http://dx.doi.org/10.1109/ICALT.2008.121",
	year = 2008
}

Jose Flich, Samuel Rodrigo and Jose Duato. An Efficient Implementation of Distributed Routing Algorithms for NoCs. In Networks-on-Chip, 2008. NoCS 2008. Second ACM/IEEE International Symposium on. 2008, 87 -96. DOI BibTeX

@conference{ 4492728,
	author = "Flich, Jose and Rodrigo, Samuel and Duato, Jose",
	abstract = "The design of NoCs for multi-core chips introduces new design constraints like power consumption, area, and ultra low latencies. Although 2D meshes are preferred, heterogeneous blocks, fabrication faults, reliability issues, and chip virtualization may lead to the need of irregular topologies or regions. In this situation, efficient routing becomes a challenge. Although the use of routing tables at switches is flexible, it does not scale in terms of latency and area due to its memory requirements. LBDR (logic-based distributed routing) is proposed as a new routing method that removes the need of using routing tables at all. LBDR enables the implementation of many routing algorithms on most of the practical topologies we might find in the near future in a multi-core system. From an initial topology and routing algorithm, a set of three bits per switch/output port is computed. Evaluation results show that, by using a small logic, LBDR mimics the performance of routing algorithms when implemented with routing tables, both in regular and irregular topologies.",
	booktitle = "Networks-on-Chip, 2008. NoCS 2008. Second ACM/IEEE International Symposium on",
	doi = "10.1109/NOCS.2008.4492728",
	keywords = "NoC;distributed routing algorithm;logic-based distributed routing;multicore chip;network-on-chip;routing tables;network routing;network-on-chip;",
	month = "7-10",
	pages = "87 -96",
	title = "{A}n {E}fficient {I}mplementation of {D}istributed {R}outing {A}lgorithms for {N}o{C}s",
	year = 2008
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. An Efficient Switching Technique for NoCs with Reduced Buffer Requirements. In Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on. 2008, 713 -720. URL, DOI BibTeX

@conference{ 4724384,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Networks on chip (NoCs) communicate the components located inside a chip. Overall system performance depends on NoC performance, that is affected by several factors. One of them is the network clock frequency, imposed by the critical path delay. Recent works show that switch critical path includes buffer control logic. Consequently, by removing switch buffers, switch frequency can be doubled. In this paper, we exploit this idea, proposing a new switching technique for NoCs which requires a reduced amount of storage at the switches. It is based on replacing switch port buffers by single latches. By doing so, network cycle can be reduced, which reduces packet latency. On the other hand, power and area consumption requirements can be reduced. However, since there are no buffers at the switch ports, packets can not be stopped. Stopped packets due to contention are dropped and reinjected from their senders via negative acknowledgments. Packet dropping is strongly reduced by exploiting NoCs wiring capability.",
	booktitle = "Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on",
	doi = "10.1109/ICPADS.2008.43",
	issn = "1521-9097",
	keywords = "buffer control logic;critical path delay;network clock frequency;network cycle;networks on chip;packet dropping;reduced buffer requirements;switching technique;network-on-chip;performance evaluation;",
	month = "dec.",
	pages = "713 -720",
	title = "{A}n {E}fficient {S}witching {T}echnique for {N}o{C}s with {R}educed {B}uffer {R}equirements",
	url = "http://dx.doi.org/10.1109/ICPADS.2008.43",
	year = 2008
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. An efficient switching technique for NoCs with reduced buffer requirements. In Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on. 2008, 713 - 20. URL BibTeX

@conference{ 10428505,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Networks on chip (NoCs) communicate the components located inside a chip. Overall system performance depends on NoC performance, that is affected by several factors. One of them is the network clock frequency, imposed by the critical path delay. Recent works show that switch critical path includes buffer control logic. Consequently, by removing switch buffers, switch frequency can be doubled. In this paper, we exploit this idea, proposing a new switching technique for NoCs which requires a reduced amount of storage at the switches. It is based on replacing switch port buffers by single latches. By doing so, network cycle can be reduced, which reduces packet latency. On the other hand, power and area consumption requirements can be reduced. However, since there are no buffers at the switch ports, packets can not be stopped. Stopped packets due to contention are dropped and reinjected from their senders via negative acknowledgments. Packet dropping is strongly reduced by exploiting NoCs wiring capability.",
	address = "Piscataway, NJ, USA",
	booktitle = "Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on",
	journal = "Proceedings of the Fourteenth International Conference on Parallel and Distributed Systems",
	keywords = "network-on-chip;performance evaluation;",
	note = "switching technique;reduced buffer requirements;networks on chip;network clock frequency;critical path delay;buffer control logic;network cycle;packet dropping;",
	pages = "713 - 20",
	title = "{A}n efficient switching technique for {N}o{C}s with reduced buffer requirements",
	url = "http://dx.doi.org/10.1109/ICPADS.2008.43",
	year = 2008
}

Crispin Gomez Requena, Francisco Gilabert Villamon, Maria E Gomez, Pedro Lopez and Jose Duato. Beyond fat - Tree: Unidirectional load - Balanced multistage interconnection network. IEEE Computer Architecture Letters 7(2):49 - 52, 2008. URL BibTeX

@article{ 20090211850984,
	author = "Crispin Gomez Requena and Francisco Gilabert Villamon and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "The fat-tree is one of the most widely-used topologies by interconnection network manufacturers. Recently, it has been demonstrated that a deterministic routing algorithm that optimally balances the network traffic can not only achieve almost the same performance than an adaptive routing algorithm but also outperforms it. On the other hand, fat-trees require a high number of switches with a non-negligible wiring complexity. In this paper, we propose replacing the fat - tree by a unidirectional multistage interconnection network (UMIN) that uses a traffic balancing deterministic routing algorithm. As a consequence, switch hardware is almost reduced to the half, decreasing, in this way, the power consumption, the arbitration complexity, the switch size itself, and the network cost. Preliminary evaluation results show that the UMIN with the load balancing scheme obtains lower latency than fat - tree for low and medium traffic loads. Furthermore, in networks with a high number of stages or with high radix switches, it obtains the same, or even higher, throughput than fat-tree. © 2006 IEEE.",
	address = "3 Park Avenue, 17th Floor, New York, NY 10016-5997, United States",
	issn = 15566056,
	journal = "IEEE Computer Architecture Letters",
	key = "Computer networks",
	keywords = "Adaptive algorithms;Interconnection networks;Internet;Metropolitan area networks;Routing algorithms;Switches;Switching circuits;Telecommunication networks;Trees;",
	note = "Butterfly network;Deterministic routing;Fat-trees;Multistage Interconnection networks;Traffic balancing;",
	number = 2,
	pages = "49 - 52",
	title = "{B}eyond fat - {T}ree: {U}nidirectional load - {B}alanced multistage interconnection network",
	url = "http://dx.doi.org/10.1109/L-CA.2008.8",
	volume = 7,
	year = 2008
}

Crispín Gomez, Francisco Gilabert, Maria E Gomez, Pedro Lopez and Jose Duato. Beyond Fat–tree: Unidirectional Load–Balanced Multistage Interconnection Network. Computer Architecture Letters 7(2):49 -52, 2008. URL, DOI BibTeX

@article{ 4544509,
	author = "Gomez, Crisp{\'i}n and Gilabert, Francisco and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "The fat-tree is one of the most widely-used topologies by interconnection network manufacturers. Recently, it has been demonstrated that a deterministic routing algorithm that optimally balances the network traffic can not only achieve almost the same performance than an adaptive routing algorithm but also outperforms it. On the other hand, fat-trees require a high number of switches with a non-negligible wiring complexity. In this paper, we propose replacing the fat-tree by a unidirectional multistage interconnection network (UMIN) that uses a traffic balancing deterministic routing algorithm. As a consequence, switch hardware is almost reduced to the half, decreasing, in this way, the power consumption, the arbitration complexity, the switch size itself, and the network cost. Preliminary evaluation results show that the UMIN with the load balancing scheme obtains lower latency than fat-tree for low and medium traffic loads. Furthermore, in networks with a high number of stages or with high radix switches, it obtains the same, or even higher, throughput than fat-tree.",
	doi = "10.1109/L-CA.2008.8",
	issn = "1556-6056",
	journal = "Computer Architecture Letters",
	keywords = "adaptive routing algorithm;interconnection network manufacturers;network traffic;nonnegligible wiring complexity;power consumption;radix switches;traffic balancing deterministic routing algorithm;unidirectional load-balanced multistage interconnection net",
	month = "july-dec.",
	number = 2,
	pages = "49 -52",
	title = "{B}eyond {F}at--tree: {U}nidirectional {L}oad--{B}alanced {M}ultistage {I}nterconnection {N}etwork",
	url = "http://dx.doi.org/10.1109/L-CA.2008.8",
	volume = 7,
	year = 2008
}

R Tornero, J M Ordua, , Jose Flich and Jose Duato. CART: Communication-Aware Routing Technique for Application-Specific NoCs. In Digital System Design Architectures, Methods and Tools, 2008. DSD '08. 11th EUROMICRO Conference on. 2008, 26 -31. URL, DOI BibTeX

@conference{ 4669215,
	author = "R. Tornero and J.M. Ordua and , and Flich, Jose and Duato, Jose",
	abstract = "Networks on Chip (NoCs) have been shown as an efficient solution to the complex on-chip communication problems derived from the increasing number of processor cores. One of the key issues in the design of NoCs is the reduction of both area and power dissipation. As a result, two-dimensional meshes have become the preferred topology, since it offers low and constant link delay. Unfortunately, manufacturing defects or even real-time failures often make the resulting topology to become irregular, preventing the use of traditional routing algorithms. This scenario shows the need for topology-agnostic routing algorithms that provide a valid routing solution when applied over any topology. Moreover, in order to deal with run-time failures, the routing algorithm should be able to fit runtime constraints. This paper proposes a new communication-aware routing technique, referred to as CART, that optimizes the network performance for application-specific NoCs. CART combines a flexible, topology-agnostic routing algorithm with a communication-aware mapping technique that matches the traffic generated by the application with the available network bandwidth. Since the mapping technique can be pruned as needed in order to fit either quality function values or time constraints, CART can be adapted to fit with different computational costs. The evaluation results show that CART significatively improves network performance in terms of both latency and power consumption.",
	booktitle = "Digital System Design Architectures, Methods and Tools, 2008. DSD '08. 11th EUROMICRO Conference on",
	doi = "10.1109/DSD.2008.19",
	isbn = "978-0-7695-3277-6",
	keywords = "CART;application-specific NoC;communication-aware mapping technique;communication-aware routing technique;complex on-chip communication problems;network-on-chip;power dissipation;topology-agnostic routing algorithms;two-dimensional meshes;network routing;",
	month = "3-5",
	pages = "26 -31",
	title = "{CART}: {C}ommunication-{A}ware {R}outing {T}echnique for {A}pplication-{S}pecific {N}o{C}s",
	url = "http://dx.doi.org/10.1109/DSD.2008.19",
	year = 2008
}

Alberto Ros, M E Acacio and J M Garcia. DiCo-CMP: Efficient cache coherency in tiled CMP architectures. In Parallel and Distributed Processing, 2008. IPDPS 2008. IEEE International Symposium on. April 2008, 1 -11. URL, DOI BibTeX

@conference{ 4536287,
	author = "Ros, Alberto and M.E. Acacio and J.M. Garcia",
	abstract = "Future CMP designs that will integrate tens of processor cores on-chip will be constrained by area and power. Area constraints make impractical the use of a bus or a crossbar as the on-chip interconnection network, and tiled CMPs organized around a direct interconnection network will probably be the architecture of choice. Power constraints make impractical to rely on broadcasts (as Token-CMP does) or any other brute-force method for keeping cache coherence, and directory-based cache coherence protocols are currently being employed. Unfortunately, directory protocols introduce indirection to access directory information, which negatively impacts performance. In this work, we present DiCo-CMP, a novel cache coherence protocol especially suited to future tiled CMP architectures. In DiCo- CMP the role of storing up-to-date sharing information and ensuring totally ordered accesses for every memory block is assigned to the cache that must provide the block on a miss. Therefore, DiCo-CMP reduces the miss latency compared to a directory protocol by sending coherence messages directly from the requesting caches to those that must observe them (as it would be done in brute-force protocols), and reduces the network traffic compared to Token-CMP (and consequently, power consumption in the interconnection network) by sending just one request message for each miss. Using an extended version of GEMS simulator we show that DiCo-CMP achieves improvements in execution time of up to 8% on average over a directory protocol, and reductions in terms of network traffic of up to 42% on average compared to Token-CMP.",
	booktitle = "Parallel and Distributed Processing, 2008. IPDPS 2008. IEEE International Symposium on",
	doi = "10.1109/IPDPS.2008.4536287",
	issn = "1530-2075",
	keywords = "DiCo-CMP;cache coherency;directory-based cache coherence protocols;on-chip interconnection network;processor cores on-chip;up-to-date sharing information;cache storage;microprocessor chips;multiprocessor interconnection networks;",
	month = "april",
	pages = "1 -11",
	title = "{D}i{C}o-{CMP}: {E}fficient cache coherency in tiled {CMP} architectures",
	url = "http://dx.doi.org/10.1109/IPDPS.2008.4536287",
	year = 2008
}

Alejandro Martinez, George Apostolopoulos, Francisco J Alfaro, Jose L Sanchez and Jose Duato. Efficient deadline-based QoS algorithms for high-performance networks. IEEE Transactions on Computers 57(7):928 - 939, 2008. URL BibTeX

@article{ 20091912073930,
	author = "Alejandro Martinez and George Apostolopoulos and Francisco J. Alfaro and Jose L. Sanchez and Duato, Jose",
	abstract = "Quality of service (QoS) is becoming an attractive feature for high-performance networks and parallel machines because, in those environments, there are different traffic types, each one having its own requirements. In that sense, deadline-based algorithms can provide powerful QoS provision. However, the cost associated with keeping ordered lists of packets makes these algorithms impractical for high-performance networks. In this paper, we explore how to efficiently adapt the Earliest Deadline First family of algorithms to high-speed network environments. The results show excellent performance using just two virtual channels, FIFO queues, and a cost feasible with today's technology. {{\&}}copy; 2008 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 00189340,
	journal = "IEEE Transactions on Computers",
	key = "Quality of service",
	keywords = "Algorithms;Interconnection networks;Quality control;",
	note = "Earliest deadline firsts;Excellent performance;Fifo queues;High-performance networks;High-speed interconnection networks;High-speed networks;Parallel machines;Virtual channels;",
	number = 7,
	pages = "928 - 939",
	title = "{E}fficient deadline-based {Q}o{S} algorithms for high-performance networks",
	url = "http://dx.doi.org/10.1109/TC.2008.39",
	volume = 57,
	year = 2008
}

Samuel Rodrigo, Jose Flich, Jose Duato and M Hummel. Efficient unicast and multicast support for CMPs. In 2008 41st Annual IEEE/ACM International Symposium on Microarchitecture (MICRO-41). 2008, 364 - 75. URL BibTeX

@conference{ 10428961,
	author = "Rodrigo, Samuel and Flich, Jose and Duato, Jose and M. Hummel",
	abstract = "Beyond a certain number of cores, multi-core processing chips will require a network-on-chip (NoC) to interconnect the cores and overcome the limitations of a bus. NoCs must be carefully designed to meet constraints like power consumption, area, and ultra low latencies. Although 2D meshes with DOR (dimension-order-routing) meet these constraints, the need for partitioning (e.g. virtual machines, coherency domains) and traffic isolation may prevent the use of DOR routing. Also, core heterogeneity and manufacturing and run-time faults may lead to partially irregular topologies. Routing in these topologies is complex, and previously proposed solutions required routing tables, which drastically increase power consumption, area, and latency. The exception is LBDR (logic-based distributed routing), a flexible routing method for irregular topologies that removes the need for using routing tables (both at end-nodes and switches), thus achieving large savings in chip area and power consumption. But LBDR lacks support for multicast and broadcast, which are required to efficiently support cache coherence protocols both for single and multiple coherence domains. In this paper we propose bLBDR, an efficient multicast and broadcast mechanism built on top of LBDR. bLBDR performs multicast operations using a logic-based broadcast within a domain (a region with bounds). This allows us to isolate the traffic into different domains, thus enabling the concept of visualization at the NoC level. Also, bLBDR extends the concept of routing regions in LBDR by providing a mechanism that allows the flexible definition of multiple domains, sets of network resources. bLBDR fulfills all the practical requirements, including not only low latency and power and area efficiency, but also support for visualization, partitionability, fault-tolerance, traffic isolation and broadcast across the entire network as well as constrained to coherency domains or regions. All this is achieved by a small and power efficient routing logic (7{{\&}}times; area savings and 17{{\&}}times; power reduction when compared to a routing table in an 8 {{\&}}times; 8 mesh network).",
	address = "Piscataway, NJ, USA",
	booktitle = "2008 41st Annual IEEE/ACM International Symposium on Microarchitecture (MICRO-41)",
	journal = "2008 41st Annual IEEE/ACM International Symposium on Microarchitecture (MICRO-41)",
	keywords = "microprocessor chips;network topology;network-on-chip;power consumption;protocols;",
	note = "CMP;chip multiprocessors;multicore processing chips;network-on-chip;power consumption;dimension-order-routing;logic-based distributed routing;routing tables;cache coherence protocols;",
	pages = "364 - 75",
	title = "{E}fficient unicast and multicast support for {CMP}s",
	url = "http://dx.doi.org/10.1109/MICRO.2008.4771805",
	year = 2008
}

J M Montañana, Jose Flich and Jose Duato. Epoch-based reconfiguration: Fast, simple, and effective dynamic network reconfiguration. In Parallel and Distributed Processing, 2008. IPDPS 2008. IEEE International Symposium on. April 2008, 1 -12. URL, DOI BibTeX

@conference{ 4536298,
author = "Monta{\~n}ana, J. M. and Flich, Jose and Duato, Jose",
abstract = "Dynamic network reconfiguration is defined as the process of changing from one routing function to another while the network remains up and running. The main challenge is to avoid deadlocks and reduce packet dropping rate while keeping network service. Current approaches either require the existence of extra network resources like e.g. virtual channels, their complexity is so high that their practical applicability is limited, or they affect to the performance of the network during the reconfiguration process. In this paper we present EBR, a simple and fast method for dynamic network reconfiguration. EBR guarantees a fast and deadlock-free reconfiguration, but instead of avoiding deadlocks our mechanism is based on regressive deadlock recoveries. Thus, EBR allows cycles to be formed, and in the situation of a deadlock some packets may be dropped. However, as demonstrated, no packets need to be dropped in the working zone of the system. Also, the mechanism works in an asynchronous manner, does not require additional resources and works on any topology. In order to minimize the number of dropped packets, EBR uses an epoch marking system that guarantees that only packets potentially leading to a deadlock will be removed. Evaluation results show that EBR works efficiently in different topologies and with different routing algorithms. When compared with current proposals, EBR always gets the best numbers in all the analyzed parameters (dropped packets, latency, throughput, reconfiguration time and resources required), thus achieving the good properties of all mechanisms.",
booktitle = "Parallel and Distributed Processing, 2008. IPDPS 2008. IEEE International Symposium on",
doi = "10.1109/IPDPS.2008.4536298",
isbn = "978-1-4244-1693-6",
issn = "1530-2075",
keywords = "deadlock-free reconfiguration;dynamic network reconfiguration;epoch-based reconfiguration;network resource;network service;packet dropping rate;regressive deadlock recovery;routing algorithm;routing function;topology;computer networks;telecommunication ne",
month = "april",
pages = "1 -12",
title = "{E}poch-based reconfiguration: {F}ast, simple, and effective dynamic network reconfiguration",
url = "http://dx.doi.org/10.1109/IPDPS.2008.4536298",
year = 2008
}

J Forment, Francisco Gilabert, Antonio Robles, V Conejero, F Nuez and J Blanca. EST2uni: an open tool for parallel, automated EST analysis and database creation, with a powerful data mining tool. In 2nd International Conference on Bioinformatics Research and Development.. 2008, 67 - 72. BibTeX

@conference{ 11172720,
	author = "J. Forment and Gilabert, Francisco and Robles, Antonio and V. Conejero and F. Nuez and J. Blanca",
	abstract = "We present EST2uni, an integrated, highly-configurable EST analysis pipeline and data mining software package that automates the pre-processing, clustering, annotation, database creation, and data mining of EST collections. The pipeline uses Perl to run standard EST analysis tools, and the code has a modular design to facilitate the addition of new analytical methods and their configuration: Currently implemented analyses include functional and structural annotation, SNP and microsatellite discovery, integration of previously known genetic marker data and gene expression results, and assistance in cDNA microarray design. It can be run in parallel in a PC cluster in order to reduce the time necessary for the analysis. It uses PHP to create a Web site linked to the database, showing collection statistics, with complex query capabilities and tools for data mining and retrieval. The code is freely available under the GPL license and is under active development to incorporate new analyses, methods, and algorithms as they are released by the bioinformatics community.",
	address = "Linz, Austria",
	booktitle = "2nd International Conference on Bioinformatics Research and Development.",
	journal = "2nd International Conference on Bioinformatics Research and Development, Poster Presentations",
	keywords = "biology computing;data mining;information retrieval;software packages;software tools;Web sites;",
	note = "EST2uni;automated EST analysis pipeline;parallel EST analysis pipeline;database creation;data mining tool;data mining software package;Perl;microsatellite discovery;genetic marker data;cDNA microarray design;PHP;Web site;data retrieval;",
	pages = "67 - 72",
	title = "{EST}2uni: an open tool for parallel, automated {EST} analysis and database creation, with a powerful data mining tool",
	year = 2008
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. Exploiting Wiring Resources on Interconnection Network: Increasing Path Diversity. In Parallel, Distributed and Network-Based Processing, 2008. PDP 2008. 16th Euromicro Conference on. 2008, 20 -29. URL, DOI BibTeX

@conference{ 4457100,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "On-chip networks are the answer to the growing demands for high communication performance of chip multiprocessors. These networks have a number of characteristics that make their design quite different to off-chip networks. In particular, wires are an abundant available resource inside the chip. In this paper, we explore how to organize the huge wiring capabilities available in on-chip networks. In particular, we analyze the option of distributing the wires among several parallel links connecting the same two switches. This technique is known as Space Division Multiplexing (SDM). The number of parallel sub-links and their width are two key parameters that are studied together with the relationship with the mean packet size. The paper shows that SDM is a technique to take into account in on-chip networks since it allows to highly increase the network accepted traffic at the expense of a small latency increase or even no increase. Moreover, in some networks, it allows to reduce the network hardware, providing simiar performance results, which results in a reduction in the consumption of area and power.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2008. PDP 2008. 16th Euromicro Conference on",
	doi = "10.1109/PDP.2008.33",
	isbn = "978-0-7695-3089-5",
	issn = "1066-6192",
	keywords = "chip multiprocessors;interconnection network;mean packet size;on-chip networks;parallel links;path diversity;space division multiplexing;wiring capabilities;wiring resources;multiprocessor interconnection networks;space division multiplexing;wiring;",
	month = "feb.",
	pages = "20 -29",
	title = "{E}xploiting {W}iring {R}esources on {I}nterconnection {N}etwork: {I}ncreasing {P}ath {D}iversity",
	url = "http://dx.doi.org/10.1109/PDP.2008.33",
	year = 2008
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. Exploiting wiring resources on interconnection network: Increasing path diversity. In Parallel, Distributed and Network-Based Processing, 2008. PDP 2008. 16th Euromicro Conference on. 2008, 20 - 29. URL BibTeX

@conference{ 20083011395413,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "On-chip networks are the answer to the growing demands for high communication performance of chip multiprocessors. These networks have a number of characteristics that make their design quite different to off-chip networks. In particular, wires are an abundant available resource inside the chip. In this paper, we explore how to organize the huge wiring capabilities available in on-chip networks. In particular, we analyze the option of distributing the wires among several parallel links connecting the same two switches. This technique is known as Space Division Multiplexing (SDM). The number of parallel sub-links and their width are two key parameters that are studied together with the relationship with the mean packet size. The paper shows that SDM is a technique to take into account in on-chip networks since it allows to highly increase the network accepted traffic at the expense of a small latency increase or even no increase. Moreover, in some networks, it allows to reduce the network hardware, providing similar performance results, which results in a reduction in the consumption of area and power. © 2008 IEEE.",
	address = "Toulouse, France",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2008. PDP 2008. 16th Euromicro Conference on",
	journal = "Proceedings of the 16th Euromicro Conference on Parallel, Distributed and Network-Based Processing, PDP 2008",
	key = "Space division multiple access",
	keywords = "Electric network topology;Internet;Telecommunication;Wire;",
	note = "Chip multi processor (CMP);Communication performances;Key parameters;Latency increase;Off chip;On Chip Network (OCN);Packet size (PS);Parallel links;Path diversity;Performance results;Space division multiplexing (SDM);",
	pages = "20 - 29",
	title = "{E}xploiting wiring resources on interconnection network: {I}ncreasing path diversity",
	url = "http://dx.doi.org/10.1109/PDP.2008.33",
	year = 2008
}

Francisco Gilabert, S Medardoni, D Bertozzi, L Benini, Maria E Gomez, Pedro Lopez and Jose Duato. Exploring high-dimensional topologies for NoC design through an integrated analysis and synthesis framework. 2008, 107 - 16. BibTeX

@conference{ 9940710,
	author = "Gilabert, Francisco and S. Medardoni and D. Bertozzi and L. Benini and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Networks-on-chip (NoCs) address the challenge to provide scalable communication bandwidth to tiled architectures in a power-efficient fashion. The 2-D mesh is currently the most popular regular topology used for on-chip networks in tile-based architectures, because it perfectly matches the 2-D silicon surface and is easy to implement. However, a number of limitations have been proved in the open literature, especially for long distance traffic. Two relevant variants of 2-D meshes are explored in this paper: high-dimensional and concentrated topologies. The novelty of our exploration framework includes the use of fast and accurate transaction level simulation to provide constraints to the physical synthesis flow, which is integrated with standard industrial toolchains for accurate physical implementation. Interestingly, this work illustrates how effectively the compared topologies can handle synchronization-intensive traffic patterns and accounts for chip I/O interfaces.",
	address = "Piscataway, NJ, USA",
	journal = "2008 2nd ACM/IEEE International Symposium on Networks-on-Chip (NOCS '08)",
	keywords = "integrated circuit design;logic design;network topology;network-on-chip;",
	note = "NoC design;networks-on-chip;2D mesh topology;on-chip networks;tile-based architectures;industrial toolchains;chip I/O interfaces;",
	pages = "107 - 16",
	title = "{E}xploring high-dimensional topologies for {N}o{C} design through an integrated analysis and synthesis framework",
	year = 2008
}

Ricardo Fernandez-Pascual, Jose M Garcia, Manuel E Acacio and Jose Duato. Extending the tokenCMP cache coherence protocol for low overhead fault tolerance in CMP architectures. IEEE Transactions on Parallel and Distributed Systems 19(8):1044 - 1056, 2008. URL BibTeX

@article{ 20083011390050,
	author = "Ricardo Fernandez-Pascual and Jose M. Garcia and Manuel E. Acacio and Duato, Jose",
	abstract = "It is widely accepted that transient failures will appear more frequently in chips designed in the near future due to several factors such as the increased integration scale. On the other hand, Chip-multiprocessors (CMP) that integrate several processor cores in a single chip are nowadays the best alternative to more efficient use of the increasing number of transistors that can be placed in a single die. Hence, it is necessary to design new techniques to deal with these faults to be able to build sufficiently reliable Chip Multiprocessors (CMPs). In this work, we present a coherence protocol aimed at dealing with transient failures that affect the interconnection network of a CMP, thus assuming that the network is no longer reliable. In particular, our proposal extends a token-based cache coherence protocol so that no data can be lost and no deadlock can occur due to any dropped message. Using GEMS full system simulator, we compare our proposal against TokenCMP. We show that in absence of failures our proposal does not introduce overhead in terms of increased execution time over TokenCMP. Additionally, our protocol can tolerate message loss rates much higher than those likely to be found in the real world without increasing execution time more than 15 percent. {{\&}}copy; 2008 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Network architecture",
	keywords = "Coherent light;Fault tolerance;Microprocessor chips;Multiprocessing systems;Nanotechnology;Quality assurance;Reliability;",
	note = "Cache coherence protocols;Chip multi processor (CMP);Chip multi-processors (CMP);CMP architectures;Coherence protocols;Execution time;Low overhead;Message loss;New techniques;Processor cores;Real world;Single chips;System simulators;",
	number = 8,
	pages = "1044 - 1056",
	title = "{E}xtending the token{CMP} cache coherence protocol for low overhead fault tolerance in {CMP} architectures",
	url = "http://dx.doi.org/10.1109/TPDS.2007.70803",
	volume = 19,
	year = 2008
}

Ricardo Fernandez-Pascual, Jose M Garcia, Manuel E Acacio and Jose Duato. Fault-tolerant cache coherence protocols for CMPs: Evaluation and trade-offs. 2008, 555 - 568. URL BibTeX

@conference{ 20090511881194,
	author = "Ricardo Fernandez-Pascual and Jose M. Garcia and Manuel E. Acacio and Duato, Jose",
	abstract = "One way of dealing with transient faults that will affect the interconnection network of future large-scale Chip Multiprocessor (CMP) systems is by extending the cache coherence protocol. Fault tolerance at the level of the cache coherence protocol has been proven to achieve very low performance overhead in absence of faults while being able to support very high fault rates. In this work, we compare two already proposed fault-tolerant cache coherence protocols in a common framework and present a new one based in the cache coherence protocol used in AMD Opteron processors. Also, we thoroughly evaluate the performance of the three protocols, show how to adjust the fault tolerance parameters of the protocols to achieve a desired level of fault tolerance and measure the overhead achieved to be able to support very high transient fault rates. {{\&}}copy; 2008 Springer Berlin Heidelberg.",
	address = "Bangalore, India",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Quality assurance",
	keywords = "Coherent light;Errors;Failure analysis;Fault tolerance;Fault tree analysis;High performance liquid chromatography;Reliability;",
	note = "Cache coherence protocols;Chip multi processor (CMP);Fault rates;Fault-tolerant;One way;Opteron processors;Transient faults;",
	pages = "555 - 568",
	title = "{F}ault-tolerant cache coherence protocols for {CMP}s: {E}valuation and trade-offs",
	url = "http://dx.doi.org/10.1007/978-3-540-89894-8_48",
	volume = "5374 LNCS",
	year = 2008
}

Jesus Escudero-Sahuquillo, Pedro Garcia, Francisco Quiles, Jose Flich and Jose Duato. FBICM: Efficient congestion management for high-performance networks using distributed deterministic routing. In High Performance Computing - HiPC 2008 5374 LNCS. 2008, 503 - 517. URL, DOI BibTeX

@conference{ 20090511881191,
	author = "Jesus Escudero-Sahuquillo and Pedro Garcia and Francisco Quiles and Flich, Jose and Duato, Jose",
	abstract = "As the number of components in cluster-based systems increases, cost and power consumption also increase. One way to reduce both problems is using smaller networks with adequate congestion management mechanisms. Recent successful proposals (RECN) eliminate the negative effects of congestion, the Head-of-Line (HOL) blocking, leaving congestion harmless. RECN relies on source-based networks architectures, where the entire route is placed at packet headers before injection. Unfortunately, distributed table-based routing is also common in cluster-based networks, being InfiniBand the most prominent example. We propose a novel congestion management technique for distributed table-based routing. The mechanism relies on additional congestion information located at routing tables. With this information HOL blocking is minimized by smartly using switch queues. Detailed memory organization and the way congestion information is updated/propagated is described. Preliminary results indicate that with modest resource requirements maximum network performance is kept regardless of congestion. © 2008 Springer Berlin Heidelberg.",
	address = "Bangalore, India",
	booktitle = "High Performance Computing - HiPC 2008",
	doi = "10.1007/978-3-540-89894-8_44",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Network management",
	keywords = "High performance liquid chromatography;Industrial management;Network performance;Parallel processing systems;Systems engineering;",
	note = "Congestion control;Congestion management;Distributed routing;Head of line (HOL) blocking;High-performance interconnects;InfiniBand (CO);Memory organizations;Negative effects;Network performances;One way;Packet headers;Power consumption (CE);Resource requirements;",
	pages = "503 - 517",
	title = "{FBICM}: {E}fficient congestion management for high-performance networks using distributed deterministic routing",
	url = "http://dx.doi.org/10.1007/978-3-540-89894-8_44",
	volume = "5374 LNCS",
	year = 2008
}

H Eberle, P J Garcia, Jose Flich, Jose Duato, R Drost, N Gura, D Hopkins and W Olesinski. High-radix crossbar switches enabled by Proximity Communication. In High Performance Computing, Networking, Storage and Analysis, 2008. SC 2008. International Conference for. 2008, 1 -12. DOI BibTeX

@conference{ 5219754,
	author = "H. Eberle and P.J. Garcia and Flich, Jose and Duato, Jose and R. Drost and N. Gura and D. Hopkins and W. Olesinski",
	abstract = "We describe a novel way to implement high-radix crossbar switches. Our work is enabled by a new chip interconnect technology called proximity communication (PxC) that offers unparalleled chip IO density. First, we show how a crossbar architecture is topologically mapped onto a PxC-enabled multi-chip module (MCM). Then, we describe a first prototype implementation of a small-scale switch based on a PxC MCM. Finally, we present a performance analysis of two large-scale switch configurations with 288 ports and 1,728 ports, respectively, contrasting a 1-stage PxC-enabled switch and a multi-stage switch using conventional technology. Our simulation results show that (a) arbitration delays in a large 1-stage switch can be considerable, (b) multi-stage switches are extremely susceptible to saturation under non-uniform traffic, a problem that becomes worse for higher radices (1-stage switches, in contrast, are not affected by this problem).",
	booktitle = "High Performance Computing, Networking, Storage and Analysis, 2008. SC 2008. International Conference for",
	doi = "10.1109/SC.2008.5219754",
	keywords = "PxC-enabled switch;chip interconnect technology;crossbar architecture;high-radix crossbar switches;multichip module;multistage switch;proximity communication;small-scale switch;unparalleled chip IO density;multichip modules;multiprocessor interconnection",
	month = "15-21",
	pages = "1 -12",
	title = "{H}igh-radix crossbar switches enabled by {P}roximity {C}ommunication",
	year = 2008
}

Blas Cuesta Sáez, Antonio Robles and Jose Duato. Improving token coherence by multicast coherence messages. 2008, 269 - 73. URL BibTeX

@conference{ 9904937,
	author = "Cuesta S{\'a}ez, Blas and Robles, Antonio and Duato, Jose",
	abstract = "Token coherence is a cache coherence protocol that joins the main advantages of traditional protocols. However, unlike them, token coherence does not handle messages in order, which may lead to races, causing some cache misses not to be solved. To assure their completion, an inefficient mechanism named persistent requests is used. Recently we have proposed the priority request mechanism to efficiently handle races. As acknowledgements are not required, a single node can solve several misses for the same memory block at the same time. When solving a lot of misses, the node may become a bottleneck. To avoid it, in this work we propose the multicast coherence message, which allows to simultaneously resolve several misses by using only one response message. It reduces the network traffic and the average response latency, improving significantly the overall performance.",
	address = "Piscataway, NJ, USA",
	journal = "2008 16th Euromicro Conference on Parallel, Distributed and Network-based Processing - PDP '08",
	keywords = "cache storage;multicast protocols;routing protocols;",
	note = "token coherence;multicast coherence messages;cache coherence protocol;priority request mechanism;network traffic;average response latency;",
	pages = "269 - 73",
	title = "{I}mproving token coherence by multicast coherence messages",
	url = "http://dx.doi.org/10.1109/PDP.2008.36",
	year = 2008
}

Jose Flich and Jose Duato. Logic-Based Distributed Routing for NoCs. Computer Architecture Letters 7(1):13 -16, 2008. DOI BibTeX

@article{ 4407676,
	author = "Flich, Jose and Duato, Jose",
	abstract = "The design of scalable and reliable interconnection networks for multicore chips (NoCs) introduces new design constraints like power consumption, area, and ultra low latencies. Although 2D meshes are usually proposed for NoCs, heterogeneous cores, manufacturing defects, hard failures, and chip virtualization may lead to irregular topologies. In this context, efficient routing becomes a challenge. Although switches can be easily configured to support most routing algorithms and topologies by using routing tables, this solution does not scale in terms of latency and area. We propose a new circuit that removes the need for using routing tables. The new mechanism, referred to as logic-based distributed routing (LBDR), enables the implementation in NoCs of many routing algorithms for most of the practical topologies we might find in the near future in a multicore chip. From an initial topology and routing algorithm, a set of three bits per switch output port is computed. By using a small logic block, LHDR mimics (demonstrated by evaluation) the behavior of routing algorithms implemented with routing tables. This result is achieved both in regular and irregular topologies. Therefore, LBDR removes the need for using routing tables for distributed routing, thus enabling flexible, fast and power-efficient routing in NoCs.",
	doi = "10.1109/L-CA.2007.16",
	issn = "1556-6056",
	journal = "Computer Architecture Letters",
	keywords = "NoC;chip virtualization;heterogeneous cores;interconnection network reliability;logic-based distributed routing;manufacturing defects;networks for multicore chips;circuit reliability;interconnections;logic circuits;network routing;network topology;network",
	month = "january-june",
	number = 1,
	pages = "13 -16",
	title = "{L}ogic-{B}ased {D}istributed {R}outing for {N}o{C}s",
	volume = 7,
	year = 2008
}

Héctor Montaner, Vicente Santonja, Federico Silla and Jose Duato. Network reconfiguration suitability for scientific applications. In Parallel Processing, 2008. ICPP '08. 37th International Conference on. 2008, 312 - 319. URL, DOI BibTeX

@conference{ 10207626,
	author = "Montaner, H{\'e}ctor and Santonja, Vicente and Silla, Federico and Duato, Jose",
	abstract = "This paper analyzes the communication pattern of several scientific applications and how they can make profit of network reconfiguration in order to adapt network topology to the communication needs so that total execution time is reduced. By using an analysis methodology based on real application executions, we study the variation of the required communication bandwidth with time and also the global interprocedural communication patterns. Results show that required bandwidth between each pair of processes does not significantly fluctuates, leading to a constant use of the links and therefore discouraging dynamic reconfigurations of the network during execution time. Nevertheless, the group of busy links changes with each application showing a different communication graph for each of them. Thus, execution time may be accelerated by using an ad-hoc topology, that is, reconfiguring the network before the execution of the application in order to adapt it to the application needs.",
	address = "Piscataway, NJ, USA",
	booktitle = "Parallel Processing, 2008. ICPP '08. 37th International Conference on",
	doi = "10.1109/ICPP.2008.58",
	journal = "2008 37th International Conference on Parallel Processing (ICPP)",
	keywords = "ad hoc networks;application program interfaces;message passing;natural sciences computing;telecommunication network topology;",
	note = "network reconfiguration suitability;scientific applications;network topology;global interprocedural communication patterns;communication graph;ad-hoc topology;message passing interface;",
	pages = "312 - 319",
	title = "{N}etwork reconfiguration suitability for scientific applications",
	url = "http://dx.doi.org/10.1109/ICPP.2008.58",
	year = 2008
}

Joan-Lluis Ferrer, Elvira Baydal, Antonio Robles, Pedro Lopez and Jose Duato. On the influence of the packet marking and injection control schemes in congestion management for MINs. 2008, 930 - 9. URL BibTeX

@conference{ 10528096,
	author = "Ferrer, Joan-Lluis and Baydal, Elvira and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "Several Congestion Management Mechanisms (CMMs) have been proposed for Multistage Interconnection Networks (MINs) in order to avoid the degradation of network performance when congestion appears. Most of them are based on Explicit Congestion Notification (ECN). For this purpose, switches detect congestion and, depending on the applied mechanism, some flags are marked to warn the source hosts. In response, source hosts apply corrective actions to adjust their packet injection rate. These mechanisms have been evaluated by analyzing whether they are able to manage a congestion situation but there is not a comparison study among them. Moreover, marking effects are not separately analyzed from corrective actions. In this paper, we analyze the current proposals for CMMs, showing the impact of the applied packet marking techniques as well as the corrective actions they apply.",
	address = "Berlin, Germany",
	journal = "Euro-Par 2008 Parallel Processing. 14th International Euro-Par Conference",
	keywords = "multistage interconnection networks;packet switching;telecommunication congestion control;",
	note = "packet marking;injection control schemes;congestion management mechanisms;multistage interconnection networks;explicit congestion notification;message throttling;",
	pages = "930 - 9",
	title = "{O}n the influence of the packet marking and injection control schemes in congestion management for {MIN}s",
	url = "http://dx.doi.org/10.1007/978-3-540-85451-7_100",
	year = 2008
}

Jose Flich, Samuel Rodrigo, Jose Duato, T Sodring, A G Solheim, T Skeie and O Lysne. On the Potential of NoC Virtualization for Multicore Chips. In Complex, Intelligent and Software Intensive Systems, 2008. CISIS 2008. International Conference on. 2008, 801 -807. DOI BibTeX

@conference{ 4606771,
	author = "Flich, Jose and Rodrigo, Samuel and Duato, Jose and T. Sodring and A.G. Solheim and T. Skeie and O. Lysne",
	abstract = "As the end of Moores-law is on the horizon, power becomes a limiting factor to continuous increases in performance gains for single-core processors. Processor engineers have shifted to the multicore paradigm and many-core processors are a reality. Within the context of these multi-core chips, three key metrics point themselves out as being of major importance, performance, fault-tolerance (including yield), and power consumption. A solution that optimizes all three of these metrics is challenging. As the number of cores increases the importance of the interconnection network-on-chip (NoC) grows as well, and chip designers should aim to optimize these three key metrics in the NoC context as well. In this paper we identify and discuss the main properties that a NoC must exhibit in order to enable such optimizations. In particular, we propose the use of virtualization techniques at the NoC level. AS a major finding, we identify the implementation of routing algorithms to become a key design parameter in order to achieve an effective virtualization of the chip should also supporting broadcast within the virtualized context. The intention behind this paper is for it to serve as a position paper on the topic of virtualization for NoC and the challenges that should be met at the routing layer in order to maximize performance, fault-tolerance and power consumption in multicore chips.",
	booktitle = "Complex, Intelligent and Software Intensive Systems, 2008. CISIS 2008. International Conference on",
	doi = "10.1109/CISIS.2008.97",
	keywords = "Moores-law;NoC virtualization;interconnection network-on-chip;many-core processors;multicore chips;routing algorithms;single-core processors;microprocessor chips;multiprocessor interconnection networks;network-on-chip;",
	month = "4-7",
	pages = "801 -807",
	title = "{O}n the {P}otential of {N}o{C} {V}irtualization for {M}ulticore {C}hips",
	year = 2008
}

, Jose Flich and Jose Duato. On the Potentials of Segment-Based Routing for NoCs. In Parallel Processing, 2008. ICPP '08. 37th International Conference on. 2008, 594 -603. URL, DOI BibTeX

@conference{ 4625898,
author = ", and Flich, Jose and Duato, Jose",
abstract = "The topology, the routing algorithm and the way the traffic pattern is distributed over the network influence the ultimate performance of the interconnection network. Off-chip high-performance interconnects provide mechanisms to support irregular topologies, whereas in on-chip networks the topology is fixed at design time. Continuous trend on device miniaturization and high volume manufacturing increase the probability of faults in embedded systems, leading to irregular topologies. Also, partitionability and virtualization of the entire on-chip network is envisioned for future systems. These trends lead to the need of routing algorithms that adapt to the static or dynamic changes in irregular topologies.In this paper we analyze the benefits of the reconfiguration at the routing algorithm level in order to allow topology changes. That is, support topology changes that appear on the network due to different reasons including switch or link failures, energy reduction decisions or design and manufacturing issues. We perform an exhaustive analysis on the performance impact of the routing algorithm in a NoC system. Our aim is to enable the possibility of reconfiguration of the routing algorithm. We take advantage on the flexibility offered by the segment-based routing methodology that allows a fast computation of many deadlock-free routing algorithms by obtaining different segmentation processes and routing restriction policies. This study analyzes the potentials offered by SR. Results show that the election of the routing algorithm may greatly affect the final performance of the network. Additionally, we propose an organized segmentation process that achieves reliable performance with low variability for all topologies studied under uniform traffic conditions. These results encourages us to the search of a dynamic mechanism that adapts the routing algorithm to the traffic.",
booktitle = "Parallel Processing, 2008. ICPP '08. 37th International Conference on",
doi = "10.1109/ICPP.2008.56",
issn = "0190-3918",
keywords = "NoC;deadlock-free routing algorithms;embedded systems;interconnection network;off-chip high-performance interconnects;routing algorithm;segment-based routing;segment-based routing methodology;traffic pattern;uniform traffic conditions;interconnections;net",
month = "9-12",
pages = "594 -603",
title = "{O}n the {P}otentials of {S}egment-{B}ased {R}outing for {N}o{C}s",
url = "http://dx.doi.org/10.1109/ICPP.2008.56",
year = 2008
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Providing full QoS with 2 VCs in high-speed switches. 2008, 345 - 354. URL BibTeX

@conference{ 20090111835975,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Current interconnect standards propose 16 or even more virtual channels (VCs) for provision of quality of service (QoS). However, VCs increase the complexity of the switch and the scheduling delays. In a previous paper, we have shown how to use only two VCs for full QoS support at the switches. In this paper, we explore thoroughly two alternative switch designs that take advantage of this reduction. We analyze their feasibility in a single chip implementation and show that they get a noticeable performance while greatly reducing the cost and power consumption of the network. {{\&}}copy; 2008 Springer Berlin Heidelberg.",
	address = "Estoril, Portugal",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Quality of service",
	keywords = "Diesel engines;Paper;Switches;Ubiquitous computing;",
	note = "Current interconnect;Power consumptions;Qos supports;Single chips;Speed switches;Switch designs;Virtual channels;",
	pages = "345 - 354",
	title = "{P}roviding full {Q}o{S} with 2 {VC}s in high-speed switches",
	url = "http://dx.doi.org/10.1007/978-3-540-89524-4-35",
	volume = "5200 LNCS",
	year = 2008
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. Reducing packet dropping in a bufferless NoC. In Euro-Par 2008 – Parallel Processing. 2008, 899 - 909. URL BibTeX

@conference{ 10528093,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Networks on chip (NoCs) has a strong impact on overall chip performance. Interconnection bandwidth is limited by the critical path delay. Recent works show that the critical path includes the switch input buffer control logic. As a consequence, by removing buffers, switch clock frequency can be doubled. Recently, a new switching technique for NoCs called blind packet switching (BPS) has been proposed. It is based on replacing the buffers of the switch ports by simple latches. Since buffers consume a high percentage of switch power and area, BPS not only improves performance but also helps in reducing power and area. In BPS there are no buffers at the switch ports, so packets can not be stopped. If the required output port is busy, the packet will be dropped. In order to prevent packet dropping, some techniques based on resource replication has been proposed. In this paper, we propose some alternative and complementary techniques that does not rely on resource replication. By using these techniques, packet dropping and its negative effects are highly reduced. In particular, packet dropping is completely removed for a very wide network traffic range. The first dropped packet appears at a 11.6 higher traffic load. As a consequence, network throughput is increased and the packet latency is kept almost constant.",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2008 – Parallel Processing",
	journal = "Euro-Par 2008 Parallel Processing. 14th International Euro-Par Conference",
	keywords = "delays;multiprocessor interconnection networks;network-on-chip;packet switching;",
	note = "bufferless NoC;networks on chip;interconnection bandwidth;buffer control logic;blind packet switching;resource replication;packet dropping;network traffic load;critical path delay;",
	pages = "899 - 909",
	title = "{R}educing packet dropping in a bufferless {N}o{C}",
	url = "http://dx.doi.org/10.1007/978-3-540-85451-7_97",
	year = 2008
}

Crispín Gomez, Francisco Gilabert, Maria E Gomez, Pedro Lopez and Jose Duato. RUFT: Simplifying the fat-tree topology. In Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on. 2008, 153 - 160. URL BibTeX

@conference{ 20090911931135,
	author = "Gomez, Crisp{\'i}n and Gilabert, Francisco and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "The fat-tree is one of the most widely-used topologies by interconnection network manufacturers. Recently, a deterministic routing algorithm that optimally balances the network traffic in fat-trees was proposed. It can not only achieve almost the same performance than adaptive routing, but also outperforms it for some traffic patterns. Nevertheless, fat-trees require a high number of switches with a non-negligible wiring complexity. In this paper, we propose replacing the fat-tree by an unidirectional multistage interconnection network referred to as Reduced Unidirectional Fat-tree (RUFT) that uses a a simplified version of the aforementioned deterministic routing algorithm. As a consequence, switch hardware is almost reduced to the half, decreasing, in this way, power consumption, arbitration complexity, switch size, and network cost. Evaluation results show that RUFT obtains lower latency than fat-tree for low and medium traffic loads. Furthermore, in large networks, it obtains almost the same throughput than the classical fat-tree. {{\&}}copy; 2008 IEEE.",
	address = "Melbourne, VIC, Australia",
	booktitle = "Parallel and Distributed Systems, 2008. ICPADS '08. 14th IEEE International Conference on",
	issn = 15219097,
	journal = "Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS",
	key = "Trees (mathematics)",
	keywords = "Interconnection networks;Internet;Routing algorithms;Switches;Switching circuits;",
	note = "Adaptive routing;Deterministic routing algorithms;Evaluation results;Large networks;Multi-stage interconnection networks;Network costs;Network traffics;Number of switches;Power consumption;Switch sizes;Traffic loads;Traffic patterns;Tree topologies;",
	pages = "153 - 160",
	title = "{RUFT}: {S}implifying the fat-tree topology",
	url = "http://dx.doi.org/10.1109/ICPADS.2008.44",
	year = 2008
}

Blas Cuesta Sáez, Antonio Robles and Jose Duato. Switch-based packing technique for improving token coherence scalability. 2008, 80 - 87. URL BibTeX

@conference{ 20090411871352,
	author = "Cuesta S{\'a}ez, Blas and Robles, Antonio and Duato, Jose",
	abstract = "Traditional cache coherence protocols either provide low latency cache misses (snooping protocols) or bandwidth efficiency (directory protocols). To simultaneously capture the best attributes of traditional protocols, Token Coherence has been recently proposed. This protocol can quickly resolve cache misses by transient requests. However, since transient requests are unordered messages, they may sometimes fail in solving cache misses mainly due to the occurrence of protocol races. Thus, when the completion of cache misses is not possible by transient requests, Token Coherence uses a starvation prevention mechanism to ensure their completion. Although several implementation options of starvation prevention mechanisms have been proposed, all of them are broadcast-based. This fact represents a large detriment to the Token Coherence scalability. To tackle this problem, in this work we apply a switchbased packing technique that alleviates the harm of broadcast messages and improves the protocol scalability. © 2008 IEEE.",
	address = "Dunedin, Otago, New zealand",
	journal = "Parallel and Distributed Computing, Applications and Technologies, PDCAT Proceedings",
	key = "Coherent light",
	keywords = "Multiprocessing systems;Scalability;",
	note = "Bandwidth efficiencies;Broadcast messages;Cache coherence protocols;Cache misses;Directory protocols;Low latencies;Packing techniques;Protocol scalabilities;Token coherences;",
	pages = "80 - 87",
	title = "{S}witch-based packing technique for improving token coherence scalability",
	url = "http://dx.doi.org/10.1109/PDCAT.2008.25",
	year = 2008
}

Tor Skeie, Daniel Ortega, Jose Flich and Raimir Holanda. Topic 13: High-performance networks. In Euro-Par 2008 – Parallel Processing 5168 LNCS. 2008, 898 -. URL BibTeX

@conference{ 20083911589414,
	author = "Tor Skeie and Daniel Ortega and Flich, Jose and Raimir Holanda",
	abstract = "The communication network is the key component of every parallel and distributed system. The trend of always aiming at bigger and more complex cores has shifted towards having many simpler cores, sharing yet another complex communication layer at the chip level. Moreover, advancements on scaling out at the cluster level have pushed communication and storage networks to new limits. All these technological opportunities bring out new and exciting research challenges. © 2008 Springer-Verlag Berlin Heidelberg.",
	address = "Las Palmas de Gran Canaria, Spain",
	booktitle = "Euro-Par 2008 – Parallel Processing",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Chlorine compounds",
	keywords = "Data storage equipment;",
	note = "Chip-level;Communication networks;Complex communication;Complex cores;Distributed systems;High-performance networks;Parallel processing;Research challenges;Storage networks;Technological opportunities;",
	pages = "898 -",
	title = "{T}opic 13: {H}igh-performance networks",
	url = "http://dx.doi.org/10.1007/978-3-540-85451-7_96",
	volume = "5168 LNCS",
	year = 2008
}

Scott Pakin, Craig Stunkel, Jose Flich, Francisco Alfaro, Gheorghe Almasi, Angelos Bilas, Ron Brightwell, Darius Buntinas, Wu-Chun Feng, Mitchell Gusat, Nectarios Koziris, Pedro Lopez, Andrew Lumsdaine, Jarek Nieplocha, Greg Pfister, Jamie Riotto, Vikram Saletore, Evan Speight, Pete Wyckoff, D K Panda, Jose Duato and Mazin Yousif. Workshop 9 Introduction: The Workshop on Communication Architecture for Clusters - CAC 2008. IPDPS Miami 2008 - Proceedings of the 22nd IEEE International Parallel and Distributed Processing Symposium, Program and CD-ROM, pages IEEE Computer Societ, 2008. URL BibTeX

@article{ 20083711535136,
	author = "Scott Pakin and Craig Stunkel and Flich, Jose and Francisco Alfaro and Gheorghe Almasi and Angelos Bilas and Ron Brightwell and Darius Buntinas and Wu-Chun Feng and Mitchell Gusat and Nectarios Koziris and Lopez, Pedro and Andrew Lumsdaine and Jarek Nieplocha and Greg Pfister and Jamie Riotto and Vikram Saletore and Evan Speight and Pete Wyckoff and D.K. Panda and Duato, Jose and Mazin Yousif",
	abstract = "No abstract available",
	address = "Miami, FL, United states",
	journal = "IPDPS Miami 2008 - Proceedings of the 22nd IEEE International Parallel and Distributed Processing Symposium, Program and CD-ROM",
	pages = "IEEE Computer Societ",
	title = "{W}orkshop 9 {I}ntroduction: {T}he {W}orkshop on {C}ommunication {A}rchitecture for {C}lusters - {CAC} 2008",
	url = "http://dx.doi.org/10.1109/IPDPS.2008.4536118",
	year = 2008
}

S Rueda, P Morillo, J M Orduna and Jose Duato. A genetic approach for adding QoS to distributed virtual environments. Computer Communications 30(4):731 - 739, 2007. URL BibTeX

@article{ 20070610409865,
	author = "S. Rueda and P. Morillo and J.M. Orduna and Duato, Jose",
	abstract = "Distributed virtual environment (DVE) systems have been designed last years as a set of distributed servers. These systems allow a large number of remote users to share a single 3D virtual scene. In order to provide quality of service in a DVE system, clients should be properly assigned to servers taking into account system throughput and system latency. The latter one is composed of both network and computational delays. This highly complex problem is known as the quality of service (QoS) problem. In this paper, we study the implementation of a genetic algorithm (GA) for solving the QoS problem in DVE systems. Performance evaluation results show that, due to its ability of both finding good search paths and keeping diversity, this nature inspired technique can provide significantly better solutions than other heuristic methods while requiring shorter execution times. Therefore, the proposed implementation of GA search method can actually improve the QoS offered by DVE systems. {{\&}}copy; 2006 Elsevier B.V. All rights reserved.",
	address = "P.O. Box 211, Amsterdam, 1000 AE, Netherlands",
	issn = 01403664,
	journal = "Computer Communications",
	key = "Genetic algorithms",
	keywords = "Computational complexity;Distributed computer systems;Quality of service;Servers;Three dimensional computer graphics;Virtual reality;",
	note = "Distributed virtual environments;Performance evaluation;Search methods;",
	number = 4,
	pages = "731 - 739",
	title = "{A} genetic approach for adding {Q}o{S} to distributed virtual environments",
	url = "http://dx.doi.org/10.1016/j.comcom.2006.08.015",
	volume = 30,
	year = 2007
}

Pedro Morillo, Silivia Reuda, Juan M Orduna and Jose Duato. A latency-aware partitioning method for distributed virtual environment systems. IEEE Transactions on Parallel and Distributed Systems 18(9):1215 - 1226, 2007. URL BibTeX

@article{ 20073610796895,
	author = "Pedro Morillo and Silivia Reuda and Juan M. Orduna and Duato, Jose",
	abstract = "Distributed Virtual Environment systems allow multiple users, working on different client computers interconnected through different networks, to interact in a shared virtual world. In these systems, latency is crucial for providing an acceptable quality of service, since it determines how fast client computers are reported about changes in the shared virtual scene produced by other client computers.This paper presents, in a unified manner, a partitioning approach for providing a latency below a threshold to the maximum number of users as possible in Distributed Virtual Environment systems. This partitioning approach searches the assignment of avatars that represents the best trade-off among system latency, system throughput, and partitioning efficiency when solving the partitioning problem. Evaluation results show that the proposed approach not only maximizes system throughput, but it also allows the system to satisfy, if possible, any specific latency requirement needed for providing quality of service. This improvement is achieved without decreasing neither image resolution nor quality of animation, and it can be used together with other techniques already proposed. Therefore, it can contribute to provide quality of service in Distributed Virtual Environments. © 2007 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Client server computer systems",
	keywords = "Animation;Image quality;Image resolution;Large scale systems;Quality of service;Virtual reality;",
	note = "Distributed virtual environment system;Partitioning method;",
	number = 9,
	pages = "1215 - 1226",
	title = "{A} latency-aware partitioning method for distributed virtual environment systems",
	url = "http://dx.doi.org/10.1109/TPDS.2007.1055",
	volume = 18,
	year = 2007
}

Ricardo Fernandez-Pascual, Jose M Garcia, Manuel E Acacio and Jose Duato. A low overhead fault tolerant coherence protocol for CMP architectures. 2007, 157 - 168. URL BibTeX

@conference{ 20073210756804,
	author = "Ricardo Fernandez-Pascual and Jose M. Garcia and Manuel E. Acacio and Duato, Jose",
	abstract = "It is widely accepted that transient failures will appear more frequently in chips designed in the near future due to several factors such as the increased integration scale. On the other hand, chip-multiprocessors (CMP) that integrate several processor cores in a single chip are nowadays the best alternative to more efficient use of the increasing number of transistors that can be placed in a single die. Hence, it is necessary to design new techniques to deal with these faults to be able to build sufficiently reliable Chip Multi-processors (CMPs). In this work, we present a coherence protocol aimed at dealing with transient failures that affect the interconnection network of a CMP, thus assuming that the network is no longer reliable. In particular, our proposal extends a token-based cache coherence protocol so that no data can be lost and no deadlock can occur due to any dropped message. Using GEMS full system simulator, we compare our proposal against a similar protocol without fault tolerance (TOKENCMP). We show that in absence of failures our proposal does not introduce overhead in terms of increased execution time over TOKENCMP. Additionally, our protocol can tolerate message loss rates much higher than those likely to be found in the real world without increasing execution time more than 15%. © 2007 IEEE.",
	address = "Scottsdale, AZ, United states",
	issn = 15300897,
	journal = "Proceedings - International Symposium on High-Performance Computer Architecture",
	key = "Network protocols",
	keywords = "Computer architecture;Computer simulation;Fault tolerant computer systems;Microprocessor chips;Multiprocessing systems;Program processors;",
	note = "Chip multiprocessors (CMP);CMP architectures;Fault tolerant coherence protocols;Processor cores;",
	pages = "157 - 168",
	title = "{A} low overhead fault tolerant coherence protocol for {CMP} architectures",
	url = "http://dx.doi.org/10.1109/HPCA.2007.346194",
	year = 2007
}

Alejandro Martinez, Francisco J Alfaro, Jose L Sanchez, Francisco J Quiles and Jose Duato. A new cost-effective technique for QoS support in clusters. IEEE Transactions on Parallel and Distributed Systems 18(12):1714 - 1726, 2007. URL BibTeX

@article{ 20074810948057,
	author = "Alejandro Martinez and Francisco J. Alfaro and Jose L. Sanchez and Francisco J. Quiles and Duato, Jose",
	abstract = "Virtual channels (VCs) are a popular solution for the provision of quality of service (QoS). Current interconnect standards propose 16 or even more VCs for this purpose. However, most implementations do not offer so many VCs because it is too expensive in terms of silicon area. Therefore, a reduction of the number of VCs necessary to support QoS can be very helpful in the switch design and implementation.In this paper, we show that this number of VCs can be reduced if the system is considered as a whole rather than each element being taken separately. The scheduling decisions made at network interfaces can be easily reused at switches without significantly altering the global behavior. In this way, we obtain a noticeable reduction of silicon area, component count, and, thus, power consumption, and we can provide similar performance to a more complex architecture. We also show that this is a scalable technique, suitable for the foreseen demands of traffic. © 2007 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Interconnection networks",
	keywords = "Computer architecture;Cost effectiveness;Interfaces;Quality of service;Telecommunication traffic;",
	note = "Scheduling decisions;Switch design;Virtual channels;",
	number = 12,
	pages = "1714 - 1726",
	title = "{A} new cost-effective technique for {Q}o{S} support in clusters",
	url = "http://dx.doi.org/10.1109/TPDS.2007.1108",
	volume = 18,
	year = 2007
}

Blas Cuesta Sáez, Antonio Robles and Jose Duato. An effective starvation avoidance mechanism to enhance the token coherence protocol. In P DAmbra and MR Guarracino (eds.). 15th EUROMICRO International Conference on Parallel, Distributed and Network-Based Processing, Proceedings. 2007, 47-54. BibTeX

@conference{ isi:000245942700007,
author = "Cuesta S{\'a}ez, Blas and Robles, Antonio and Duato, Jose",
abstract = "Shared-memory multiprocessors are becoming to be formed by an increasingly larger number of nodes. In these systems, implementing cache coherence is a key issue. Token Coherence is a low latency cache coherence protocol that avoids indirection for cache-to-cache misses and which does not require a totally-ordered interconnect. When races are rare, the protocol performs well thanks to the performance policy. Unfortunately, some medium/large systems and some applications that often access the same data simultaneously make races more common. As a result, the protocol does not perform as well as it could because it uses the persistent request mechanism to prevent starvation. This mechanism is too slow and inflexible because it overrides the performance policy. In consequence, the protocol slows down the system and does not take advantage of the flexibility and speed of the common case. We propose a new mechanism, namely priority requests, which replaces the persistent request one. Our mechanism solves races, while still respecting the performance policy, simply by ordering and giving a higher priority to requests suffering from starvation. Thus, our mechanism handles the tokens more efficiently and reduces the network traffic.",
booktitle = "15th EUROMICRO International Conference on Parallel, Distributed and Network-Based Processing, Proceedings",
editor = "DAmbra, P and Guarracino, MR",
isbn = 9780769527840,
note = "15th Euromicro International Conference on Parallel, Distributed and Network-Based Processing, Naples, ITALY, FEB 07-09, 2007",
pages = "47-54",
title = "{A}n effective starvation avoidance mechanism to enhance the token coherence protocol",
year = 2007
}

Crispín Gomez, Maria E Gomez, Pedro Lopez and Jose Duato. An efficient fault-tolerant routing methodology for fat-tree interconnection networks*. 2007, 509 - 22. BibTeX

@conference{ 9683889,
	author = "Gomez, Crisp{\'i}n and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "In large cluster-based machines, fault-tolerance in the interconnection network is an issue of growing importance, since their increasing size rises the probability of failure. The topology used in these machines is usually a fat-tree. This paper proposes a new distributed fault-tolerant routing methodology for fat-trees. It does not require additional network hardware. It is scalable, since the required memory, switch hardware and routing delay do not depend on the net work size. The methodology is based on enhancing the interval routing scheme with exclusion intervals. Exclusion intervals are associated to each switch output port, and represent the set of nodes that are unreachable from this port after a failure appears. We propose a mechanism to identify the exclusion intervals that must be updated after detecting a failure, and the values to write on them. Our methodology is able to support a relatively high number of network failures with a low degradation in network performance.",
	address = "Berlin, Germany",
	journal = "Parallel and Distributed Processing and Applications. Proceedings 5th International Symposium, ISPA 2007. (Lecture Notes in Computer Science vol. 4742)",
	keywords = "failure analysis;fault tolerant computing;multiprocessor interconnection networks;network routing;network topology;probability;trees;",
	note = "distributed fault-tolerant routing methodology;fat-tree interconnection networks;large cluster-based machines;failure probability;interval routing scheme;switch output port;",
	pages = "509 - 22",
	title = "{A}n efficient fault-tolerant routing methodology for fat-tree interconnection networks*",
	year = 2007
}

Francisco J Alfaro, Jose L Sanchez, Manuel Menduina and Jose Duato. {A. IEEE Transactions on Computers (8):1024 - 1039. BibTeX

@article{ 20091912073795,
	author = "Francisco J. Alfaro and Jose L. Sanchez and Manuel Menduina and Duato, Jose",
	abstract = "The InfiniBand architecture (IBA) is an industry-standard architecture for server I/O and interprocessor communication. IBA enables quality-of-service (QoS) support with certain mechanisms. These mechanisms are basically the service levels, the virtual lanes, and the table-based arbitration of those virtual lanes. In previous papers, we have examined these mechanisms and described how we can apply them to the requirements requested by the applications. We have also tested our proposals, showing that the applications achieve the level of QoS requested. In this paper, we present a formal model for the techniques previously proposed. According to this model, each application needs a sequence of entries in the IBA arbitration tables based on its requirements. These requirements are related to the mean bandwidth needed and the maximum latency tolerated by the application. Specifically, each request requires a number of entries with a maximum separation between any consecutive pair. In order to manage the requests, we propose certain algorithms and we prove some propositions and theorems, showing that our method achieves good behavior. © 2007 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 00189340,
	journal = "IEEE Transactions on Computers",
	key = "Queueing networks",
	keywords = "Applications;",
	note = "Formal model;InfiniBand;InfiniBand architectures;Inter-processor communications;QoS;Quality-of-service;Service levels;Standard architectures;",
	number = 8,
	pages = "1024 - 1039",
	title = "{A"
}

, Jose Flich, Jose Duato, Sven-Arne Reinemo and Tor Skeie. Boosting Ethernet Performance by Segment-Based Routing. In Parallel, Distributed and Network-Based Processing, 2007. PDP '07. 15th EUROMICRO International Conference on. 2007, 55 -62. URL, DOI BibTeX

@conference{ 4135259,
	author = ", and Flich, Jose and Duato, Jose and Sven-Arne Reinemo and Tor Skeie",
	abstract = "Ethernet is turning out to be a cost-effective solution for building cluster networks offering compatibility, simplicity, high bandwidth, scalability and a good performance-to-cost ratio. Nevertheless, Ethernet still makes inefficient use of network resources (links) and suffers from long failure recovery time due to the lack of a suitable routing algorithm. In this paper we embed an efficient routing algorithm into 802.3 Ethernet technology, making it possible to use off-the-shelf equipment to build high-performance and cost-effective Ethernet clusters, with an efficient use of link bandwidth and with fault tolerant capabilities. The algorithm, referred to as segment-based routing (SR), is a deterministic routing algorithm that achieves high performance without the need for virtual channels (not available in Ethernet). Moreover, SR is topology agnostic, meaning it can be applied to any topology, and tolerates any combination of faults derived from the original topology when combined with static reconfiguration. Through simulations we verify an overall improvement in throughput by a factor of 1.2 to 10.0 when compared to the conventional Ethernet routing algorithm, the spanning tree protocol (STP), and other topology agnostic routing algorithms such as Up*/Down* and tree-based turn-prohibition, the last one being recently proposed for Ethernet",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2007. PDP '07. 15th EUROMICRO International Conference on",
	doi = "10.1109/PDP.2007.28",
	issn = "1066-6192",
	keywords = "Ethernet technology;Ethernet clusters;cluster networks;fault tolerant capability;off-the-shelf equipment;routing algorithm;segment-based routing;spanning tree protocol;static reconfiguration;topology agnostic routing algorithms;tree-based turn-prohi",
	month = "feb.",
	pages = "55 -62",
	title = "{B}oosting {E}thernet {P}erformance by {S}egment-{B}ased {R}outing",
	url = "http://dx.doi.org/10.1109/PDP.2007.28",
	year = 2007
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Deadline-based QoS algorithms for high-performance networks. 2007, 9 pp. -. BibTeX

@conference{ 9516704,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Quality of service (QoS) is becoming an attractive feature for high-performance networks and parallel machines because it could allow a more efficient use of resources. Deadline-based algorithms can provide powerful QoS provision. However, the cost associated with keeping ordered lists of packets makes them impractical for high-performance networks. In this paper, we explore how to adapt efficiently the earliest deadline first family of algorithms to the high-speed networks environments. The results show excellent performance using just two virtual channels, FIFO queues, and a cost feasible with today's technology.",
	address = "Piscataway, NJ, USA",
	journal = "2007 IEEE International Parallel and Distributed Processing Symposium (IEEE Cat. No.07TH8938)",
	keywords = "multiprocessor interconnection networks;parallel machines;quality of service;",
	note = "quality of service;QoS;high-performance network;parallel machine;earliest deadline first algorithm;high-speed network;virtual channel;FIFO queue;",
	pages = "9 pp. -",
	title = "{D}eadline-based {Q}o{S} algorithms for high-performance networks",
	year = 2007
}

B Ossa, J A Gil, Julio Sahuquillo and A Pont. Delfos: the Oracle to Predict NextWeb User's Accesses. In Advanced Information Networking and Applications, 2007. AINA '07. 21st International Conference on. May 2007, 679 -686. URL, DOI BibTeX

@conference{ 4220957,
	author = "de la Ossa, B. and J.A. Gil and Sahuquillo, Julio and A. Pont",
	abstract = "Despite the wide and intensive research efforts focused on Web prediction and prefetching techniques aimed to reduce user's perceived latency, few attempts to implement and use them in real environments have been done, mainly due to their complexity and supposed limitations that low user available bandwidths imposed few years ago. Nevertheless, current user bandwidths open a new scenario for prefetching that becomes again an interesting option to improve web performance. This paper presents Delfos, a framework to perform web predictions and prefetching on a real environment that tries to cover the existing gap between research and praxis. Delfos is integrated in the web architecture without modifying the standard HTTP 1.1 protocol, and acts inserting predictions in the web server side, while prefetchs are carried out by the client. In addition, it can be also used as a flexible framework to evaluate and compare existing prefetching techniques and algorithms and to assist in the design of new ones because it provides detailed statistics reports.",
	booktitle = "Advanced Information Networking and Applications, 2007. AINA '07. 21st International Conference on",
	doi = "10.1109/AINA.2007.50",
	isbn = "0-7695-2846-5",
	keywords = "Delfos;Web architecture;Web prediction;Web prefetching;Web server;Web user access;oracle;Web services;authorisation;software architecture;storage management;transport protocols;",
	month = "may",
	pages = "679 -686",
	title = "{D}elfos: the {O}racle to {P}redict {N}ext{W}eb {U}ser's {A}ccesses",
	url = "http://dx.doi.org/10.1109/AINA.2007.50",
	year = 2007
}

Crispín Gomez, Francisco Gilabert, Maria E Gomez, Pedro Lopez and Jose Duato. Deterministic versus Adaptive Routing in Fat-Trees. In Parallel and Distributed Processing Symposium, 2007. IPDPS 2007. IEEE International. March 2007, 1 -8. URL, DOI BibTeX

@conference{ 4228210,
	author = "Gomez, Crisp{\'i}n and Gilabert, Francisco and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Clusters of PCs have become very popular to build high performance computers. These machines use commodity PCs linked by a high speed interconnect. Routing is one of the most important design issues of interconnection networks. Adaptive routing usually better balances network traffic, thus allowing the network to obtain a higher throughput. However, adaptive routing introduces out-of-order packet delivery, which is unacceptable for some applications. Concerning topology, most of the commercially available interconnects are based on fat-tree. Fat-trees offer a rich connectivity among nodes, making possible to obtain paths between all source-destination pairs that do not share any link. We exploit this idea to propose a deterministic routing algorithm for fat-trees, comparing it with adaptive routing in several workloads. The results show that deterministic routing can achieve a similar, and in some scenarios higher, level of performance than adaptive routing, while providing in-order packet delivery.",
	booktitle = "Parallel and Distributed Processing Symposium, 2007. IPDPS 2007. IEEE International",
	doi = "10.1109/IPDPS.2007.370482",
	isbn = "1-4244-0910-1",
	keywords = "PC clusters;adaptive routing;deterministic routing algorithm;fat-tree topology;interconnection networks;packet delivery;multistage interconnection networks;telecommunication network routing;telecommunication network topology;telecommunication traffic;tree",
	month = "march",
	pages = "1 -8",
	title = "{D}eterministic versus {A}daptive {R}outing in {F}at-{T}rees",
	url = "http://dx.doi.org/10.1109/IPDPS.2007.370482",
	year = 2007
}

Crispín Gomez, Francisco Gilabert, Maria E Gomez, Pedro Lopez and Jose Duato. Deterministic versus adaptive routing in fat-trees. In Parallel and Distributed Processing Symposium, 2007. IPDPS 2007. IEEE International. 2007, 8 pp. -. URL, DOI BibTeX

@conference{ 9516533,
	author = "Gomez, Crisp{\'i}n and Gilabert, Francisco and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Clusters of PCs have become very popular to build high performance computers. These machines use commodity PCs linked by a high speed interconnect. Routing is one of the most important design issues of interconnection networks. Adaptive routing usually better balances network traffic, thus allowing the network to obtain a higher throughput. However, adaptive routing introduces out-of-order packet delivery, which is unacceptable for some applications. Concerning topology, most of the commercially available interconnects are based on fat-tree. Fat-trees offer a rich connectivity among nodes, making possible to obtain paths between all source-destination pairs that do not share any link. We exploit this idea to propose a deterministic routing algorithm for fat-trees, comparing it with adaptive routing in several workloads. The results show that deterministic routing can achieve a similar, and in some scenarios higher, level of performance than adaptive routing, while providing in-order packet delivery.",
	booktitle = "Parallel and Distributed Processing Symposium, 2007. IPDPS 2007. IEEE International",
	doi = "10.1109/IPDPS.2007.370482",
	isbn = "1-4244-0910-1",
	journal = "2007 IEEE International Parallel and Distributed Processing Symposium (IEEE Cat. No.07TH8938)",
	keywords = "multistage interconnection networks;telecommunication network routing;telecommunication network topology;telecommunication traffic;trees;",
	month = "Mar.",
	note = "adaptive routing;fat-tree topology;PC clusters;interconnection networks;packet delivery;deterministic routing algorithm;",
	pages = "8 pp. -",
	publisher = "IEEE Computer Society",
	title = "{D}eterministic versus adaptive routing in fat-trees",
	url = "http://dx.doi.org/10.1109/IPDPS.2007.370482",
	year = 2007
}

Alejandro Martinez, Francisco J Alfaro, Jose L Sanchez and Jose Duato. Efficient switches with QoS support for clusters. 2007, IEEE Computer Societ. URL BibTeX

@conference{ 20073910825291,
	author = "Alejandro Martinez and Francisco J. Alfaro and Jose L. Sanchez and Duato, Jose",
	abstract = "Current interconnect standards providing hardware support for quality of service (QoS) consider up to 16 virtual channels (VCs) for this purpose. However, most implementations do not offer so many VCs because they increase the complexity of the switch and the scheduling delays. We have shown that this number of VCs can be significantly reduced, because it is enough to use two VCs for QoS purposes at each switch port. In this paper, we cover the weaknesses of that proposal and, not only we reduce VCs, but we also improve performance due to the flexibility assigning buffer memory. {{\&}}copy; 2007 IEEE.",
	address = "Long Beach, CA, United states",
	journal = "Proceedings - 21st International Parallel and Distributed Processing Symposium, IPDPS 2007; Abstracts and CD-ROM",
	key = "Switching systems",
	keywords = "Buffer storage;Communication channels (information theory);Delay circuits;Interconnection networks;Quality of service;Scheduling;Standards;Virtual reality;",
	note = "Buffer memory;Interconnect standards;Virtual channels (VC);",
	pages = "IEEE Computer Societ",
	title = "{E}fficient switches with {Q}o{S} support for clusters",
	url = "http://dx.doi.org/10.1109/IPDPS.2007.370473",
	year = 2007
}

Eun Jung Kim, Ki Hwan Yum, Chita R Das, Mazin Yousif and Jose Duato. Exploring IBA design space for improved performance. IEEE Transactions on Parallel and Distributed Systems 18(4):498 - 510, 2007. URL BibTeX

@article{ 20071510536730,
	author = "Eun Jung Kim and Ki Hwan Yum and Chita R. Das and Mazin Yousif and Duato, Jose",
	abstract = "InfiniBand Architecture (IBA) is envisioned to be the default communication fabric for future system area networks (SANs) or clusters. However, IBA design is currently in its infancy since the released specification outlines only higher level functionalities, leaving it open for exploring various design alternatives. In this paper, we investigate four corelated techniques for providing high and predictable performance in IBA. These are: 1) using the Shortest Path First (SPF) algorithm for deterministic packet routing, 2) developing a multipath routing mechanism for minimizing congestion, 3) developing a selective packet dropping scheme to handle deadlock and congestion, and 4) providing multicasting support for customized applications. These designs are implemented in a pipelined, IBA-style switch architecture, and are evaluated using an integrated workload consisting of MPEG-2 video streams, best-effort traffic, and control traffic on a versatile IBA simulation testbed. Simulation results with 15-node and 30-node irregular networks indicate that the SPF routing, multipath routing, packet dropping, and multicasting schemes are quite effective in delivering high and assured performance in clusters. {{\&}}copy; 2007 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Network architecture",
	keywords = "Computer simulation;Congestion control (communication);Multicasting;Network routing;Packet networks;Quality of service;Video streaming;",
	note = "InfiniBand architecture;Packet dropping;System area networks;",
	number = 4,
	pages = "498 - 510",
	title = "{E}xploring {IBA} design space for improved performance",
	url = "http://dx.doi.org/10.1109/TPDS.2007.1010",
	volume = 18,
	year = 2007
}

Aurelio Bermudez, Rafael Casado, Francisco J Quiles and Jose Duato. Handling topology changes in InfiniBand. IEEE Transactions on Parallel and Distributed Systems 18(2):172 - 185, 2007. URL BibTeX

@article{ 20070710422540,
	author = "Aurelio Bermudez and Rafael Casado and Francisco J. Quiles and Duato, Jose",
	abstract = "InfiniBand is a high-performance switched network. Its topology may change due to devices being turned on/off, hot expansion, link remapping, and component failures. The InfiniBand specification defines a management infrastructure which is responsible for detecting and assimilating any change in the network. When a change occurs, management entities must update switch forwarding tables, in order to maintain the connectivity among end nodes. This implies the acquisition of the current topology and the computation of a new set of routes accordingly. It is desirable that the execution of this process does not affect the performance of the upper-level applications that are using the network. In previous works, we have proposed enhanced implementations for the main tasks involved in the assimilation of a change. Now, we present a detailed performance evaluation of a management mechanism which incorporates all our proposals. {{\&}}copy; 2007 IEEE.",
	address = "445 Hoes Lane - P.O.Box 1331, Piscataway, NJ 08855-1331, United States",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Switching networks",
	keywords = "Local area networks;Management;Network protocols;Routers;Topology;",
	note = "InfiniBand;Network management;",
	number = 2,
	pages = "172 - 185",
	title = "{H}andling topology changes in {I}nfini{B}and",
	url = "http://dx.doi.org/10.1109/TPDS.2007.26",
	volume = 18,
	year = 2007
}

Blas Cuesta Sáez, Antonio Robles and Jose Duato. Improving token coherence by Multicast Coherence Messages. In D ElBaz, J Bourgeois and F Spies (eds.). PROCEEDINGS OF THE 16TH EUROMICRO CONFERENCE ON PARALLEL, DISTRIBUTED AND NETWORK-BASED PROCESSING. 2007, 269-273. BibTeX

@conference{ isi:000254266500036,
	author = "Cuesta S{\'a}ez, Blas and Robles, Antonio and Duato, Jose",
	abstract = "Token Coherence is a cache coherence protocol that joins the main advantages of traditional protocols. However, unlike them, Token Coherence does not handle messages in order, which may lead to races, causing some cache misses not to be solved To assure their completion, an inefficient mechanism named persistent requests is used Recently we have proposed the priority request mechanism to efficiently handle races. As acknowledgements are not required, a single node can solve several misses for the same memory block at the same time. When solving a lot of misses, the node may become a bottleneck. To avoid it, in this work we propose the Multicast Coherence Message, which allows to simultaneously resolve several misses by using only one response message. It reduces the network traffic and the average response latency, improving significantly the overall performance.",
	booktitle = "PROCEEDINGS OF THE 16TH EUROMICRO CONFERENCE ON PARALLEL, DISTRIBUTED AND NETWORK-BASED PROCESSING",
	editor = "ElBaz, D and Bourgeois, J and Spies, F",
	isbn = 9780769530895,
	issn = "1066-6192",
	note = "16th Euromicro International Conference on Parallel, Distributed and Network-Based Processing, Toulouse, FRANCE, FEB 13-15, 2008",
	pages = "269-273",
	series = "Euromicro Workshop on Parallel and Distributed Processing",
	title = "{I}mproving token coherence by {M}ulticast {C}oherence {M}essages",
	year = 2007
}

B Ossa, J A Gil, Julio Sahuquillo and A Pont. Improving Web Prefetching by Making Predictions at Prefetch. In Next Generation Internet Networks, 3rd EuroNGI Conference on. May 2007, 21 -27. URL, DOI BibTeX

@conference{ 4231816,
	author = "de la Ossa, B. and J.A. Gil and Sahuquillo, Julio and A. Pont",
	abstract = "Most of the research attempts to improve Web prefetching techniques have focused on the prediction algorithm with the objective of increasing its precision or, in the best case, to reduce the user's perceived latency. In contrast, to improve prefetching performance, this work concentrates in the prefetching engine and proposes the Prediction at Prefetch (P@P) technique. This paper explains how a prefetching technique can be extended to include our P@P proposal on real world conditions without changes in the web architecture or HTTP protocol. To show how this proposal can improve prefetching performance an extensive performance evaluation study has been done and the results show that P@P can considerably reduce the user's perceived latency with no additional cost over the basic prefetch mechanism.",
	booktitle = "Next Generation Internet Networks, 3rd EuroNGI Conference on",
	doi = "10.1109/NGI.2007.371193",
	isbn = "1-4244-0857-1",
	keywords = "HTTP protocol;Web browser;Web prefetching techniques;Web server;prediction algorithm;prediction at prefetch technique;user perceived latency;Internet;information retrieval;",
	month = "may",
	pages = "21 -27",
	title = "{I}mproving {W}eb {P}refetching by {M}aking {P}redictions at {P}refetch",
	url = "http://dx.doi.org/10.1109/NGI.2007.371193",
	year = 2007
}

A Martinez-Vicente, P J Garcia, F J Alfaro, J L Sanchez, Jose Flich, F J Quiles and Jose Duato. Integrated QoS provision and congestion management for interconnection networks. In Euro-Par 2007. Parallel Processing. Proceedings 13th International Euro-Par Conference. LNCS 4641. 2007, 837 - 47. BibTeX

@conference{ 9689023,
	author = "A. Martinez-Vicente and P.J. Garcia and F.J. Alfaro and J.L. Sanchez and Flich, Jose and F.J. Quiles and Duato, Jose",
	abstract = "Both QoS support and congestion management techniques have become essential for achieving good performance in current highspeed interconnection networks. However, traditional techniques proposed for both issues require too many resources for being implemented. In this paper we propose a new switch architecture that efficiently uses the same resources to offer both congestion management and QoS provision. It is as effective as previous proposals, but much more cost-effective.",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2007. Parallel Processing. Proceedings 13th International Euro-Par Conference.",
	journal = "Euro-Par 2007. Parallel Processing. Proceedings 13th International Euro-Par Conference. (Lecture Notes in Computer Science vol. 4641)",
	keywords = "computer network management;multistage interconnection networks;quality of service;queueing theory;telecommunication congestion control;",
	note = "switch architecture;interconnection network;quality of service;QoS support;congestion management technique;",
	pages = "837 - 47",
	title = "{I}ntegrated {Q}o{S} provision and congestion management for interconnection networks",
	volume = "LNCS 4641",
	year = 2007
}

Rafael Ubal, Julio Sahuquillo, Salvador Petit, H Hassan and Pedro Lopez. Leakage Current Reduction in Data Caches on Embedded Systems. In Intelligent Pervasive Computing, 2007. IPC. The 2007 International Conference on. 2007, 45 -50. URL, DOI BibTeX

@conference{ 4438392,
	author = "Ubal, Rafael and Sahuquillo, Julio and Petit, Salvador and H. Hassan and Lopez, Pedro",
	abstract = "Nowadays, embedded systems can be found in a wide range of pervasive devices (e.g., smart phones, PDAs, or video/digital cameras). These devices contain large cache memories, whose power consumption can reach about 50% of the total spent energy, from which leakage energy is the predominant fraction in current technologies. This paper proposes a technique to reduce leakage energy consumption in data caches on embedded systems, which is based on the fact that most stored bits take a logical value of zero. The proposed technique has been evaluated on a model of a contemporary high-end embedded microprocessor, namely the ARM Cortex A8 processor, executing a set of standard embedded benchmarks. Experimental results show that leakage energy savings reach about 40% with no IPC loss.",
	booktitle = "Intelligent Pervasive Computing, 2007. IPC. The 2007 International Conference on",
	doi = "10.1109/IPC.2007.95",
	keywords = "ARM Cortex A8 processor;cache memories;data caches;high-end embedded microprocessor;leakage energy consumption reduction;pervasive devices;cache storage;microprocessor chips;power consumption;ubiquitous computing;",
	month = "oct.",
	pages = "45 -50",
	title = "{L}eakage {C}urrent {R}eduction in {D}ata {C}aches on {E}mbedded {S}ystems",
	url = "http://dx.doi.org/10.1109/IPC.2007.95",
	year = 2007
}

Rafael Ubal, Julio Sahuquillo, Salvador Petit and Pedro Lopez. Multi2Sim: A Simulation Framework to Evaluate Multicore-Multithreaded Processors. In Computer Architecture and High Performance Computing, 2007. SBAC-PAD 2007. 19th International Symposium on. 2007, 62 -68. URL, DOI BibTeX

@conference{ 4384043,
	author = "Ubal, Rafael and Sahuquillo, Julio and Petit, Salvador and Lopez, Pedro",
	abstract = "Current microprocessors are based in complex designs, integrating different components on a single chip, such as hardware threads, processor cores, memory hierarchy or interconnection networks. The permanent need of evaluating new designs on each of these components motivates the development of tools which simulate the system working as a whole. In this paper, we present the Multi2Sim simulation framework, which models the major components of incoming systems, and is intended to cover the limitations of existing simulators. A set of simulation examples is also included for illustrative purposes.",
	booktitle = "Computer Architecture and High Performance Computing, 2007. SBAC-PAD 2007. 19th International Symposium on",
	doi = "10.1109/SBAC-PAD.2007.17",
	issn = "1550-6533",
	keywords = "Multi2Sim;hardware threads;interconnection networks;memory hierarchy;microprocessors;multicore-multithreaded processors;processor cores;multi-threading;multiprocessor interconnection networks;",
	month = "oct.",
	pages = "62 -68",
	title = "{M}ulti2{S}im: {A} {S}imulation {F}ramework to {E}valuate {M}ulticore-{M}ultithreaded {P}rocessors",
	url = "http://dx.doi.org/10.1109/SBAC-PAD.2007.17",
	year = 2007
}

S Rueda, P Morillo, J M Orduna and Jose Duato. On the characterization of peer-to-peer distributed virtual environments. 2007, 107 - 114. URL BibTeX

@conference{ 20073210756837,
	author = "S. Rueda and P. Morillo and J.M. Orduna and Duato, Jose",
	abstract = "Large scale distributed virtual environments (DVEs) have become a major trend in distributed applications, mainly due to the enormous popularity of multi-player online games in the entertainment industry. Since architectures based on networked servers seem to be not scalable enough to support massively multi-player applications, peer-to-peer (P2P) architectures have been proposed as an efficient and truly scalable solution for this kind of systems. However, in order to design efficient DVEs based on peer-to-peer architectures these systems must be characterized, measuring the impact of different client behaviors on system performance. This paper presents the experimental characterization of peer-to-peer distributed virtual environments in regard to well-known performance metrics in distributed systems. Characterization results show that system saturation is inherently avoided due to the peerto-peer scheme, as it could be expected. Also, these results show that the saturation of a given client exclusively has an effect on the surrounding clients in the virtual world, having no noticeable effect at all on the rest of avatars. Finally, the characterization results show that the response time offered to client computers greatly depends on the number of new connections that these clients have to make when new neighbors appear in the virtual world. These results can be used as the basis for an efficient design of peer-to-peer DVE systems. {{\&}}copy; 2007 IEEE.",
	address = "Charlotte, NC, United states",
	journal = "Proceedings - IEEE Virtual Reality",
	key = "Virtual reality",
	keywords = "Computer architecture;Distributed computer systems;Interactive computer graphics;Online systems;Servers;",
	note = "Distributed virtual environments;Entertainment industry;Multiplayer online games;Peer-to-peer architectures;",
	pages = "107 - 114",
	title = "{O}n the characterization of peer-to-peer distributed virtual environments",
	url = "http://dx.doi.org/10.1109/VR.2007.352470",
	year = 2007
}

Marina Alonso, Salvador Coll, Vicente Santonja, Juan Miguel Martínez, Pedro Lopez and Jose Duato. Power-aware fat-tree networks using on/off links. In R Perrott, BM Chapman, J Subhlok, RF DeMello and LT Yang (eds.). HIGH PERFORMANCE COMPUTING AND COMMUNICATIONS, PROCEEDINGS 4782. 2007, 472-483. BibTeX

@conference{ isi:000250940200040,
	author = "Alonso, Marina and Coll, Salvador and Santonja, Vicente and Mart{\'i}nez, Juan Miguel and Lopez, Pedro and Duato, Jose",
	abstract = "Nowadays, power consumption reduction techniques are being increasingly used in computer systems, and high-performance computing systems are not an exception. In particular, the power consumed by the interconnect circuitry has a non-negligible contribution to the total system budget. In this scenario, fat-tree interconnection networks are one of the most popular topologies. This topology is particularly well-suited for applying power consumption reduction techniques since it provides multiple alternative paths for each source/destination pair. In this paper, we present a mechanism that dynamically adjusts the available network bandwidth by switching links on and off, according to the traffic requirements. This mechanism provides significant reduction in power consumption while maintaining the original underlying routing algorithm, at the expense of slight latency increase for low loads.",
	booktitle = "HIGH PERFORMANCE COMPUTING AND COMMUNICATIONS, PROCEEDINGS",
	editor = "Perrott, R and Chapman, BM and Subhlok, J and DeMello, RF and Yang, LT",
	isbn = "978-3-540-75443-5",
	issn = "0302-9743",
	note = "3rd International Conference on High Performance Computing and Communications (HPCC 2007), Houston, TX, SEP 26-28, 2007",
	pages = "472-483",
	series = "LECTURE NOTES IN COMPUTER SCIENCE",
	title = "{P}ower-aware fat-tree networks using on/off links",
	volume = 4782,
	year = 2007
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Providing full QoS with 2 VCs in high-speed switches. 2007, 345 - 54. BibTeX

@conference{ 10418557,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Current interconnect standards propose 16 or even more virtual channels (VCs) for provision of quality of service (QoS). However, VCs increase the complexity of the switch and the scheduling delays. In a previous paper, we have shown how to use only two VCs for full QoS support at the switches. In this paper, we explore thoroughly two alternative switch designs that take advantage of this reduction. We analyze their feasibility in a single chip implementation and show that they get a noticeable performance while greatly reducing the cost and power consumption of the network.",
	address = "Berlin, Germany",
	journal = "Information Networking. Towards Ubiquitous Networking and Services. International Conference, ICOIN 2007",
	keywords = "quality of service;scheduling;switches;telecommunication switching;",
	note = "QoS;high-speed switches;current interconnect standard;virtual channel;quality of service;scheduling delay;power consumption;",
	pages = "345 - 54",
	title = "{P}roviding full {Q}o{S} with 2 {VC}s in high-speed switches",
	year = 2007
}

Gaspar Mora, P J Garcia, Jose Flich and Jose Duato. RECN-IQ: A Cost-Effective Input-Queued Switch Architecture with Congestion Management. In Parallel Processing, 2007. ICPP 2007. International Conference on. 2007, 74 -74. URL, DOI BibTeX

@conference{ 4343881,
	author = "Mora, Gaspar and P.J. Garcia and Flich, Jose and Duato, Jose",
	abstract = "As the number of computing and storage nodes keeps increasing, the interconnection network is becoming a key element of many computing and communication systems, where the overall performance directly depends on network performance. This performance may dramatically drop during congestion situations. Although congestion may be avoided by over dimensioning the network, the current trend is to reduce overall cost and power consumption by reducing the number of network components. Thus, the network will be prone to congestion, thereby becoming mandatory the use of congestion management techniques. In that sense, the technique known as Regional Explicit Congestion Notification (RECN) completely eliminates the Head-of-Line (HOL) blocking produced by congested packets, turning congestion harmless. However, RECN has been designed for switches with queues at input and output ports (CIOQ switches), thus it can not be directly applied to other types of switches. Additionally, the method RECN uses for detecting congestion requires several detection queues that increase the memory requirements and thus switch cost. Thus, we completely redefine the RECN mechanism in order to achieve different goals. First, we adapt RECN to a switch organization with queues only at input ports (IQ switches). These switches are simpler and cheaper to produce than CIOQ ones. Second, we propose a new method for detecting congestion that does not require several detection queues, thereby reducing RECN memory requirements. These improvements lead to achieve a cost-effective switch organization that derive maximum performance even in the presence of congestion. Also, we present in detail a realistic switch architecture supporting the new mechanism. Results demonstrate that the new RECN version in an IQ switch achieves maximum network performance in all the analyzed situations. These results have been a reduction factor of data memory requirements of 5 with respect to the previous RECN mechanism in CIOQ- - switches.",
	booktitle = "Parallel Processing, 2007. ICPP 2007. International Conference on",
	doi = "10.1109/ICPP.2007.71",
	issn = "0190-3918",
	keywords = "RECN-IQ memory requirement;cost-effective input-queued switch architecture;head-of-line blocking;interconnection network;packet congestion management technique;power consumption;regional explicit congestion notification;computer architecture;multiprocesso",
	month = "10-14",
	pages = "74 -74",
	title = "{RECN}-{IQ}: {A} {C}ost-{E}ffective {I}nput-{Q}ueued {S}witch {A}rchitecture with {C}ongestion {M}anagement",
	url = "http://dx.doi.org/10.1109/ICPP.2007.71",
	year = 2007
}

Jose Flich, , Pedro Lopez and Jose Duato. Region-Based Routing: An Efficient Routing Mechanism to Tackle Unreliable Hardware in Network on Chips. In Networks-on-Chip, 2007. NOCS 2007. First International Symposium on. 2007, 183 -194. URL, DOI BibTeX

@conference{ 4209007,
	author = "Flich, Jose and , and Lopez, Pedro and Duato, Jose",
	abstract = "The design of scalable and reliable interconnection networks for system on chips (SoCs) introduce new design constraints not present in current multicomputer systems. Although regular topologies are preferred for building NoCs, heterogeneous blocks, fabrication faults and reliability issues derived from the high integration scale may lead to irregular topologies. In this situation, efficient routing becomes a challenge. Although table-based routing allows the use of most routing algorithms on any topology, it does not scale in terms of latency and area. In this paper we propose the region-based routing mechanism that avoids the scalability problems of table-based solutions. From an initial topology and routing algorithm, the mechanism groups, at every switch, destinations into different regions based on the output ports. By doing this, redundant routing information typically found in routing tables is eliminated. Evaluation results show that the mechanism requires only four regions to support several routing algorithms in a 2D mesh with no performance degradation. Moreover, when dealing with link failures, our results indicate that the mechanism combined with the segment-based routing algorithm is able to pack all the routing information into eight regions providing high throughput. The paper provides also a simple and efficient hardware implementation of the mechanism requiring only 240 logic gates per switch to support eight regions in a 2D mesh topology",
	booktitle = "Networks-on-Chip, 2007. NOCS 2007. First International Symposium on",
	doi = "10.1109/NOCS.2007.39",
	keywords = "2D mesh topology;interconnection networks;multicomputer systems;network on chips;region-based routing;segment-based routing algorithm;system on chips;table-based routing;integrated circuit interconnections;logic design;microprocessor chips;network routing",
	month = "7-9",
	pages = "183 -194",
	title = "{R}egion-{B}ased {R}outing: {A}n {E}fficient {R}outing {M}echanism to {T}ackle {U}nreliable {H}ardware in {N}etwork on {C}hips",
	url = "http://dx.doi.org/10.1109/NOCS.2007.39",
	year = 2007
}

Julio Sahuquillo, N Tomas, Salvador Petit and A Pont. Spim-Cache: A Pedagogical Tool for Teaching Cache Memories Through Code-Based Exercises. Education, IEEE Transactions on 50(3):244 -250, 2007. URL, DOI BibTeX

@article{ 4287124,
	author = "Sahuquillo, Julio and N. Tomas and Petit, Salvador and A. Pont",
	abstract = "Cache memories represent a core topic in all computer organization and architecture courses offered at universities around the world. As a consequence, educational proposals and textbooks address important efforts to this topic. A valuable pedagogical help when studying cache memories is to perform exercises based on simple algorithms, which allow the identification of cache accesses, for instance, a program accessing the elements of an array. These exercises, referred to as code-based exercises, have a good acceptance among instructors of computer organization courses. Nevertheless, no tool (e.g., simulator) has been developed to be used in undergraduate courses working with this kind of exercises; therefore, students perform such exercises by means of the classic paper and pencil methodology. To fill this gap, this paper proposes a new pedagogical tool, namely Spim-cache. A laboratory example is also presented for illustrative purposes.",
	doi = "10.1109/TE.2007.900021",
	issn = "0018-9359",
	journal = "Education, IEEE Transactions on",
	keywords = "Spim-cache;cache memories;code-based exercises;computer architecture courses;computer organization courses;pedagogical tool;undergraduate courses;cache storage;computer aided instruction;computer science education;educational courses;",
	month = "aug.",
	number = 3,
	pages = "244 -250",
	title = "{S}pim-{C}ache: {A} {P}edagogical {T}ool for {T}eaching {C}ache {M}emories {T}hrough {C}ode-{B}ased {E}xercises",
	url = "http://dx.doi.org/10.1109/TE.2007.900021",
	volume = 50,
	year = 2007
}

Hilario Lopez, Antonio Robles, Ivan Machon, Eva Fernandez and Luis Fernando Sancho. Temperature monitoring system in the mould of a slab continuous casting line. In 2007 IEEE INTERNATIONAL SYMPOSIUM ON INDUSTRIAL ELECTRONICS, PROCEEDINGS, VOLS 1-8. 2007, 175-179. BibTeX

@conference{ isi:000252265100032,
	author = "Hilario Lopez and Robles, Antonio and Ivan Machon and Eva Fernandez and Luis Fernando Sancho",
	abstract = "In this article a study is introduced that was carried out for the implementation of a temperature monitoring system in the mould of a slab continuous casting line in ACERALIA's LD3 steel factory in Aviles (Asturias). To achieve this, instrumentation has been proposed consisting on precision thermocouples placed along the vertical mid-line of both the broad face and the narrow face of the mould. Signals are converted and sent through an industrial bus to the acquisition station. Here, the data from the process computer (the conditions under which the casting develops) is also stored. The ultimate objective is the retrieval of actual data on temperatures at specific locations of the mould. These data can be used for the adjustment of models of mould operative behaviour.",
	booktitle = "2007 IEEE INTERNATIONAL SYMPOSIUM ON INDUSTRIAL ELECTRONICS, PROCEEDINGS, VOLS 1-8",
	isbn = "978-1-4244-0754-5",
	note = "IEEE International Symposium on Industrial Electronics, Vigo, SPAIN, JUN 04-07, 2007",
	pages = "175-179",
	title = "{T}emperature monitoring system in the mould of a slab continuous casting line",
	year = 2007
}

Rafael Ubal, Julio Sahuquillo, Salvador Petit, Pedro Lopez and Jose Duato. VB-MT: Design Issues and Performance of the Validation Buffer Microarchitecture for Multithreaded Processors. In Parallel Architecture and Compilation Techniques, 2007. PACT 2007. 16th International Conference on. 2007, 429 -429. URL, DOI BibTeX

@conference{ 4336257,
	author = "Ubal, Rafael and Sahuquillo, Julio and Petit, Salvador and Lopez, Pedro and Duato, Jose",
	abstract = "The validation buffer (VB) Microarchitecture retires instructions out of order, by substituting the classical ROB by the VB structure. The VB removes the negative effect of long latency instructions located at the ROB head, which prevent other instructions from retiring and cause frequent pipeline stalls due to lack of space in the ROB. This work analyzes different multithreading models (coarse grain, fine grain and simultaneous multithreading) and a set of different instruction fetch policies.",
	booktitle = "Parallel Architecture and Compilation Techniques, 2007. PACT 2007. 16th International Conference on",
	doi = "10.1109/PACT.2007.4336257",
	issn = "1089-795X",
	keywords = "ROB head;VB structure;instruction fetch policies;multithreaded processors;validation buffer microarchitecture;buffer storage;multi-threading;parallel architectures;storage allocation;",
	month = "sept.",
	pages = "429 -429",
	title = "{VB}-{MT}: {D}esign {I}ssues and {P}erformance of the {V}alidation {B}uffer {M}icroarchitecture for {M}ultithreaded {P}rocessors",
	url = "http://dx.doi.org/10.1109/PACT.2007.4336257",
	year = 2007
}

Maria E Gomez, N A Nordbotten, Jose Flich, Pedro Lopez, Antonio Robles, Jose Duato, T Skeie and O Lysne. A routing methodology for achieving fault tolerance in direct networks. Computers, IEEE Transactions on 55(4):400 - 415, April 2006. URL, DOI BibTeX

@article{ 1608003,
author = "Gomez, Maria E. and N.A. Nordbotten and Flich, Jose and Lopez, Pedro and Robles, Antonio and Duato, Jose and T. Skeie and O. Lysne",
abstract = "Massively parallel computing systems are being built with thousands of nodes. The nterconnection network plays a key role for the performance of such systems. However, the high number of components significantly increases the probability of failure. Additionally, failures in the interconnection network may isolate a large fraction of the machine. It is therefore critical to provide an efficient fault-tolerant mechanism to keep the system running, even in the presence of faults. This paper presents a new fault-tolerant routing methodology that does not degrade performance in the absence of faults and tolerates a reasonably large number of faults without disabling any healthy node. In order to avoid faults, for some source-destination pairs, packets are first sent to an intermediate node and then from this node to the destination node. Fully adaptive routing is used along both subpaths. The methodology assumes a static fault model and the use of a checkpoint/restart mechanism. However, there are scenarios where the faults cannot be avoided solely by using an intermediate node. Thus, we also provide some extensions to the methodology. Specifically, we propose disabling adaptive routing and/or using misrouting on a per-packet basis. We also propose the use of more than one intermediate node for some paths. The proposed fault-tolerant routing methodology is extensively evaluated in terms of fault tolerance, complexity, and performance.",
doi = "10.1109/TC.2006.46",
issn = "0018-9340",
journal = "Computers, IEEE Transactions on",
keywords = "adaptive routing; checkpoint-restart mechanism; direct networks; fault-tolerant routing methodology; interconnection network; parallel computing system; fault tolerant computing; multiprocessor interconnection networks; network routing; parallel processi",
month = "april",
number = 4,
pages = "400 - 415",
title = "{A} routing methodology for achieving fault tolerance in direct networks",
url = "http://dx.doi.org/10.1109/TC.2006.46",
volume = 55,
year = 2006
}

Maria E Gomez, N A Nordbotten, Jose Flich, Pedro Lopez, Antonio Robles, Jose Duato, T Skeie and O Lysne. A routing methodology for achieving fault tolerance in direct networks. IEEE Transactions on Computers 55(4):400 - 15, 2006. URL, DOI BibTeX

@article{ 8935111,
author = "Gomez, Maria E. and N.A. Nordbotten and Flich, Jose and Lopez, Pedro and Robles, Antonio and Duato, Jose and T. Skeie and O. Lysne",
abstract = "Massively parallel computing systems are being built with thousands of nodes. The interconnection network plays a key role for the performance of such systems. However, the high number of components significantly increases the probability of failure. Additionally, failures in the interconnection network may isolate a large fraction of the machine. It is therefore critical to provide an efficient fault-tolerant mechanism to keep the system running, even in the presence of faults. This paper presents a new fault-tolerant routing methodology that does not degrade performance in the absence of faults and tolerates a reasonably large number of faults without disabling any healthy node. In order to avoid faults, for some source-destination pairs, packets are first sent to an intermediate node and then from this node to the destination node. Fully adaptive routing is used along both subpaths. The methodology assumes a static fault model and the use of a checkpoint/restart mechanism. However, there are scenarios where the faults cannot be avoided solely by using an intermediate node. Thus, we also provide some extensions to the methodology. Specifically, we propose disabling adaptive routing and/or using misrouting on a per-packet basis. We also propose the use of more than one intermediate node for some paths. The proposed fault-tolerant routing methodology is extensively evaluated in terms of fault tolerance, complexity, and performance",
address = "USA",
doi = "10.1109/TC.2006.46",
issn = "0018-9340",
journal = "IEEE Transactions on Computers",
keywords = "fault tolerant computing;multiprocessor interconnection networks;network routing;parallel processing;",
note = "direct networks;parallel computing system;interconnection network;fault-tolerant routing methodology;adaptive routing;checkpoint-restart mechanism;",
number = 4,
pages = "400 - 15",
title = "{A} routing methodology for achieving fault tolerance in direct networks",
url = "http://dx.doi.org/10.1109/TC.2006.46",
volume = 55,
year = 2006
}

P Morillo, J M Orduna and Jose Duato. A scalable synchronization technique for distributed virtual environments based on networked-server architectures. 2006, 74 - 81. URL BibTeX

@conference{ 20073110720815,
	author = "P. Morillo and J.M. Orduna and Duato, Jose",
	abstract = "In recent years, large scale distributed virtual environments have become a major trend in distributed applications, mainly due to the enormous popularity of multiplayer online games in the entertainment industry. Thus, scalability has become an essential issue for these highly interactive systems. In this paper, we propose a new synchronization technique for those distributed virtual environments that are based on networked-server architectures. Unlike other methods described in the literature, the proposed technique takes into account the updating messages exchanged by avatars, thus releasing the servers from updating the location of such avatars when synchronizing the state of the system. As a result, the communications required for synchronization are greatly reduced, and this method results more scalable. Also, these communications are distributed along the whole synchronization period, in order to reduce workload peaks. Performance evaluation results show that the proposed approach significantly reduces the percentage of CPU utilization in the servers when compared with other existing methods, therefore supporting a higher number of avatars. Additionally, the system response time is reduced accordingly. {{\&}}copy; 2006 IEEE.",
	address = "Columbus, OH, United states",
	issn = 15302016,
	journal = "Proceedings of the International Conference on Parallel Processing Workshops",
	key = "Distributed computer systems",
	keywords = "Communication systems;Computer architecture;Data processing;Interactive computer systems;Program processors;Servers;Virtual reality;",
	note = "Distributed applications;Entertainment industry;Synchronization technique;Virtual environments;",
	pages = "74 - 81",
	title = "{A} scalable synchronization technique for distributed virtual environments based on networked-server architectures",
	url = "http://dx.doi.org/10.1109/ICPPW.2006.16",
	year = 2006
}

P Morillo, J M Ordufia and Jose Duato. A scalable synchronization technique for distributed virtual environments based on networked-server architectures. 2006, 8 pp. -. BibTeX

@conference{ 9089294,
author = "P. Morillo and J.M. Ordufia and Duato, Jose",
abstract = "Large scale distributed virtual environments have become a major trend in distributed applications, mainly due to the enormous popularity of multi-player online games in the entertainment industry. Thus, scalability has become an essential issue for these highly interactive systems. In this paper, we propose a new synchronization technique for those distributed virtual environments that are based on networked-server architectures. Unlike other methods described in the literature, the proposed technique takes into account the updating messages exchanged by avatars, thus releasing the servers from updating the location of such avatars when synchronizing the state of the system. As a result, the communications required for synchronization are greatly reduced, and this method results more scalable. Also, these communications are distributed along the whole synchronization period, in order to reduce workload peaks. Performance evaluation results show that the proposed approach significantly reduces the percentage of CPU utilization in the servers when compared with other existing methods, therefore supporting a higher number of avatars. Additionally, the system response time is reduced accordingly",
address = "Los Alamitos, CA, USA",
journal = "2006 International Conference on Parallel Processing Workshops",
keywords = "avatars;distributed processing;interactive systems;network servers;performance evaluation;resource allocation;",
note = "scalable synchronization technique;distributed virtual environments;networked-server architectures;multiplayer online games;entertainment industry;avatars;CPU utilization;interactive systems;",
pages = "8 pp. -",
title = "{A} scalable synchronization technique for distributed virtual environments based on networked-server architectures",
year = 2006
}

J Domenech, J A Gil, Julio Sahuquillo and A Pont. DDG: An Efficient Prefetching Algorithm for Current Web Generation. In Hot Topics in Web Systems and Technologies, 2006. HOTWEB '06. 1st IEEE Workshop on. 2006, 1 -12. URL, DOI BibTeX

@conference{ 4178377,
	author = "J. Domenech and J.A. Gil and Sahuquillo, Julio and A. Pont",
	abstract = "Web prefetching is one of the techniques proposed to reduce user's perceived latencies in the World Wide Web. The spatial locality shown by user's accesses makes it possible to predict future accesses based on the previous ones. A prefetching engine uses these predictions to prefetch the Web objects before the user demands them. The existing prediction algorithms achieved an acceptable performance when they were proposed but the high increase in the amount of embedded objects per page has reduced their effectiveness in the current Web. In this paper we show that most of the predictions made by the existing algorithms are useless to reduce the user's perceived latency because these algorithms do not take into account how current Web pages are structured, i.e., an HTML object with several embedded objects. Thus, they predict the accesses to the embedded objects in an HTML after reading the HTML itself. For this reason, the prediction advance is not enough to prefetch the objects and therefore there is no latency reduction. As a result of a wide analysis of the behaviour of the most commonly used algorithms, in this paper we present the DDG algorithm that distinguishes between container objects (HTML) and embedded objects to create a new prediction model according to the structure of the current Web. Results show that, for the same amount of extra requests to the server, DDG always outperforms the existing algorithms by reducing the perceived latency between 15% and 150% more without increasing the computing complexity",
	booktitle = "Hot Topics in Web Systems and Technologies, 2006. HOTWEB '06. 1st IEEE Workshop on",
	doi = "10.1109/HOTWEB.2006.355260",
	isbn = "1-4244-0596-3",
	keywords = "HTML object;Web object prefetching;Web pages;World Wide Web;container objects;embedded objects;latency reduction;spatial locality;user access;Internet;hypermedia markup languages;information retrieval;storage management;",
	month = "nov.",
	pages = "1 -12",
	title = "{DDG}: {A}n {E}fficient {P}refetching {A}lgorithm for {C}urrent {W}eb {G}eneration",
	url = "http://dx.doi.org/10.1109/HOTWEB.2006.355260",
	year = 2006
}

J Domenech, Julio Sahuquillo, A Pont and J A Gil. Design Keys to Adapt Web Prefetching Algorithms to Environment Conditions. In Communication System Software and Middleware, 2006. Comsware 2006. First International Conference on. 2006, 1 -7. URL, DOI BibTeX

@conference{ 1665179,
	author = "J. Domenech and Sahuquillo, Julio and A. Pont and J.A. Gil",
	abstract = "This paper focuses on the design process of Web prefetching algorithms. The main goal of prefetching techniques in web is to reduce user perceived latency. Since these techniques present a high number of non-desired collateral effects that can negatively affect the system performance, the design process of new algorithms must be carefully performed. In a previous work we proposed some performance metrics to evaluate Web prefetching and introduced the byte recall index. In this work we present a statistical analysis which identifies how the environment conditions impact on the most significant indexes (recall and byte recall) used to evaluate prefetch algorithms. Our experimental results show that, depending on the user available bandwidth and the server processing time of each request, the recall is more correlated to the user's perceived latency than the byte recall and vice versa, so that we specify and suggest guidelines to adapt an algorithm to different environment conditions",
	booktitle = "Communication System Software and Middleware, 2006. Comsware 2006. First International Conference on",
	doi = "10.1109/COMSWA.2006.1665179",
	isbn = "0-7803-9575-1",
	keywords = "Web prefetching algorithm;byte recall index;nondesired collateral effect;server processing time;statistical analysis;Internet;statistical analysis;storage management;",
	month = "0-0",
	pages = "1 -7",
	title = "{D}esign {K}eys to {A}dapt {W}eb {P}refetching {A}lgorithms to {E}nvironment {C}onditions",
	url = "http://dx.doi.org/10.1109/COMSWA.2006.1665179",
	year = 2006
}

Teresa Nachiondo, Jose Flich and Jose Duato. Destination-based HoL blocking elimination. In Parallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference onParallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference on 1. 2006, 10 pp. -. URL, DOI BibTeX

@conference{ 9077844,
	author = "Nachiondo, Teresa and Flich, Jose and Duato, Jose",
	abstract = "In future interconnection networks, congestion management is likely to become a critical issue owing to increasing power consumption and cost concerns. As congested packets introduce head-of-line (HoL) blocking to the rest of packets, congestion spreads quickly. The best-known solution to HoL blocking, virtual output queues (VOQs), is not scalable at all or too costly when implemented in large networks. In previous works, we proposed an efficient and cost-effective solution, referred to as destination-based buffer management (DBBM). DBBM groups destinations into different sets, and packets addressed to destinations in the same set are mapped to the same queue. DBBM eliminates most of the HoL blocking (among packets addressed to different sets). It achieves very good results in terms of scalability, throughput, and robustness. However, depending on the distribution of packet destinations, it may introduce an uncertain degree of unfairness among packets mapped on the same queue. In order to overcome this problem, we propose the dynamic DBBM mechanism (DDBBM). DDBBM dynamically eliminates completely the HoL blocking. Performance results show that DDBBM keeps (and in some cases improves) the good results achieved by DBBM in terms of throughput and scalability. Moreover, DDBBM solves the unfairness introduced by DBBM. As an example of applicability, in this paper we show that DDBBM can be applied to InfiniBand with no hardware modification",
	booktitle = "Parallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference onParallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference on",
	doi = "10.1109/ICPADS.2006.34",
	isbn = "0-7695-2612-8",
	issn = "1521-9097",
	journal = "12th International Conference on Parallel and Distributed Systems",
	keywords = "buffer storage;computer network management;packet switching;queueing theory;telecommunication congestion control;",
	note = "destination-based HoL blocking elimination;interconnection network;network congestion management;head-of-line blocking;virtual output queues;dynamic destination-based buffer management;packet destination distribution;InfiniBand;",
	pages = "10 pp. -",
	title = "{D}estination-based {H}o{L} blocking elimination",
	url = "http://dx.doi.org/10.1109/ICPADS.2006.34",
	volume = 1,
	year = 2006
}

F O Sem-Jacobsen, T Skeie, O Lysne and Jose Duato. Dynamic fault tolerance with misrouting in fat trees. 2006, 10 pp. -. BibTeX

@conference{ 9165284,
	author = "F.O. Sem-Jacobsen and T. Skeie and O. Lysne and Duato, Jose",
	abstract = "Fault tolerance is critical for efficient utilisation of large computer systems. Dynamic fault tolerance allows the network to remain available through the occurance of faults as opposed to static fault tolerance which requires the network to be halted to reconfigure it. Although dynamic fault tolerance may lead to less efficient solutions than static fault tolerance, it allows for a much higher availability of the system. In this paper we devise a dynamic fault tolerant adaptive routing algorithm for the fat tree, a much used interconnect topology, which relies on misrouting around link faults. We show that we are guaranteed to tolerate any combination of less than (num_switch_ports)/2 link faults without the need for additional network resources for deadlock freedom. There is also a high probability of tolerating an even larger number of link faults. Simulation results show that network performance degrades very little when faults are dynamically tolerated",
	address = "Los Alamitos, CA, USA",
	journal = "Proceedings. 2006 International Conference on Parallel Processing",
	keywords = "fault tolerant computing;multiprocessor interconnection networks;telecommunication network routing;trees (mathematics);",
	note = "dynamic fault tolerance;fat tree;dynamic fault tolerant adaptive routing;interconnect topology;link fault misrouting;network performance;",
	pages = "10 pp. -",
	title = "{D}ynamic fault tolerance with misrouting in fat trees",
	year = 2006
}

Frank Olaf Sem-Jacobsen, Tor Skeie, Olav Lysne and Jose Duato. Dynamic fault tolerance with misrouting in fat trees. 2006, 33 - 42. URL BibTeX

@conference{ 20073110737857,
	author = "Frank Olaf Sem-Jacobsen and Tor Skeie and Olav Lysne and Duato, Jose",
	abstract = "Fault tolerance is critical for efficient utilisation of large computer systems. Dynamic fault tolerance allows the network to remain available through the occurance of faults as opposed to static fault tolerance which requires the network to be halted to reconfigure it. Although dynamic fault tolerance may lead to less efficient solutions than static fault tolerance, it allows for a much higher availability of the system. In this paper we devise a dynamic fault tolerant adaptive routing algorithm for the fat tree, a much used interconnect topology, which relies on misrouting around link faults. We show that we are guaranteed to tolerate any combination of less than num_switch_ports/2 link faults without the need for additional network resources for deadlock freedom. There is also a high probability of tolerating an even larger number of link faults. Simulation results show that network performance degrades very little when faults are dynamically tolerated. {{\&}}copy; 2006 IEEE.",
	address = "Columbus, OH, United states",
	issn = 01903918,
	journal = "Proceedings of the International Conference on Parallel Processing",
	key = "Fault tolerance",
	keywords = "Adaptive algorithms;Computer networks;Computer resource management;Computer simulation;Network routing;Trees (mathematics);",
	note = "Dynamic fault tolerance;Large computer systems;Link faults;",
	pages = "33 - 42",
	title = "{D}ynamic fault tolerance with misrouting in fat trees",
	url = "http://dx.doi.org/10.1109/ICPP.2006.36",
	year = 2006
}

Marina Alonso, Salvador Coll, Jose Maria Martínez, Vicente Santonja, Pedro Lopez and Jose Duato. Dynamic power saving in fat-tree interconnection networks using on/off links. In Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International. April 2006, 8 pp.. URL, DOI BibTeX

@conference{ 1639599,
	author = "Alonso, Marina and Coll, Salvador and Mart{\'i}nez, Jose Maria and Santonja, Vicente and Lopez, Pedro and Duato, Jose",
	abstract = "Current trends in high-performance parallel computers show that fat-tree interconnection networks are one of the most popular topologies. The particular characteristics of this topology, that provide multiple alternative paths for each source/destination pair, make it an excellent candidate for applying power consumption reduction techniques. Such techniques are being increasingly applied in computer systems and the interconnection network is not an exception, since its contribution to the system power budget is not negligible. In this paper, we present a mechanism that dynamically switches on and off network links as a function of traffic. The mechanism is designed to guarantee network connectivity, according to the underlying routing algorithm. In this way, the default routing algorithm can be used regardless of the power saving actions taken, thus simplifying router design. Our simulation results show that significant network power consumption reductions can be obtained at no cost. Latency remains the same although the number of operating network links is dynamically adjusted.",
	booktitle = "Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International",
	doi = "10.1109/IPDPS.2006.1639599",
	isbn = "0-7695-0990-8",
	keywords = "dynamic power saving; fat-tree interconnection networks; high-performance parallel computers; network power consumption reduction; on-off links; routing algorithm; energy conservation; multiprocessor interconnection networks; parallel processing;",
	month = "april",
	pages = "8 pp.",
	title = "{D}ynamic power saving in fat-tree interconnection networks using on/off links",
	url = "http://dx.doi.org/10.1109/IPDPS.2006.1639599",
	year = 2006
}

Marina Alonso, Salvador Coll, Juan Miguel Martínez, Vicente Santonja, Pedro Lopez and Jose Duato. Dynamic power saving in fat-tree interconnection networks using on/off links. In Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International. 2006, 8 pp. -. URL, DOI BibTeX

@conference{ 8978456,
	author = "Alonso, Marina and Coll, Salvador and Mart{\'i}nez, Juan Miguel and Santonja, Vicente and Lopez, Pedro and Duato, Jose",
	abstract = "Current trends in high-performance parallel computers show that fat-tree interconnection networks are one of the most popular topologies. The particular characteristics of this topology, that provide multiple alternative paths for each source/destination pair, make it an excellent candidate for applying power consumption reduction techniques. Such techniques are being increasingly applied in computer systems and the interconnection network is not an exception, since its contribution to the system power budget is not negligible. In this paper, we present a mechanism that dynamically switches on and off network links as a function of traffic. The mechanism is designed to guarantee network connectivity, according to the underlying routing algorithm. In this way, the default routing algorithm can be used regardless of the power saving actions taken, thus simplifying router design. Our simulation results show that significant network power consumption reductions can be obtained at no cost. Latency remains the same although the number of operating network links is dynamically adjusted",
	booktitle = "Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International",
	doi = "10.1109/IPDPS.2006.1639599",
	isbn = "1-4244-0054-6",
	journal = "Proceedings. 20th International Parallel and Distributed Processing Symposium (IEEE Cat. No.06TH8860)",
	keywords = "energy conservation;multiprocessor interconnection networks;parallel processing;trees;",
	month = "Apr.",
	note = "dynamic power saving;fat-tree interconnection networks;on/off links;high-performance parallel computers;routing algorithm;network power consumption reduction;",
	pages = "8 pp. -",
	publisher = "IEEE Computer Society",
	title = "{D}ynamic power saving in fat-tree interconnection networks using on/off links",
	url = "http://dx.doi.org/10.1109/IPDPS.2006.1639599",
	year = 2006
}

P J Garcia, F J Quiles, Jose Flich, Jose Duato, I Johnson and F Naven. Efficient, Scalable Congestion Management for Interconnection Networks. Micro, IEEE 26(5):52 -66, 2006. DOI BibTeX

@article{ 1709823,
	author = "P.J. Garcia and F.J. Quiles and Flich, Jose and Duato, Jose and I. Johnson and F. Naven",
	abstract = "Compared to the overdimensioned designs of the past, current interconnection networks operate closer to the point of saturation and run a higher risk of congestion. Among proposed strategies for congestion management, only the regional explicit congestion notification (RECN) mechanism achieves both the required efficiency and the scalability that emerging systems demand",
	doi = "10.1109/MM.2006.88",
	issn = "0272-1732",
	journal = "Micro, IEEE",
	keywords = "RECN mechanism;interconnection networks;regional explicit congestion notification;scalable congestion management;multiprocessor interconnection networks;",
	month = "sept.-oct.",
	number = 5,
	pages = "52 -66",
	title = "{E}fficient, {S}calable {C}ongestion {M}anagement for {I}nterconnection {N}etworks",
	volume = 26,
	year = 2006
}

Aurelio Bermudez, Rafael Casado, Francisco J Quiles and Jose Duato. Fast routing computation on InfiniBand networks. IEEE Transactions on Parallel and Distributed Systems 17(3):215 - 226, 2006. URL BibTeX

@article{ 2006079693287,
	author = "Aurelio Bermudez and Rafael Casado and Francisco J. Quiles and Duato, Jose",
	abstract = "The InfiniBand architecture has been proposed as a technology both for communication between processing nodes and I/O devices, and for interprocessor communication. Its specification defines a basic management infrastructure that is responsible for subnet configuration and fault tolerance. Each time a topology change is detected, new forwarding tables have to be computed and uploaded to devices. The time required to compute these tables is a critical issue, due to application traffic is negatively affected by the temporary lack of connectivity. In this paper, we show the way to integrate several routing algorithms, in order to combine their advantages. In particular, we merge a new proposal, characterized by its high computation speed but low efficiency, with a traditional one, slower but more efficient. Our goal is to provide new routes in a short period of time, minimizing the degradation mentioned before, and maintaining, at the same time, high network performance. {{\&}}copy; 2006 IEEE.",
	issn = 10459219,
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Computer architecture",
	keywords = "Algorithms;Fault tolerant computer systems;Local area networks;Network protocols;Topology;",
	note = "Fast routing computation;High-speed LANs;InfiniBand networks;Network management;Network topology;Routing protocols;",
	number = 3,
	pages = "215 - 226",
	title = "{F}ast routing computation on {I}nfini{B}and networks",
	url = "http://dx.doi.org/10.1109/TPDS.2006.35",
	volume = 17,
	year = 2006
}

Maria E Gomez, Pedro Lopez and Jose Duato. FIR: An efficient routing strategy for tori and meshes. Journal of Parallel and Distributed Computing 66(7):907 - 21, 2006. URL, DOI BibTeX

@article{ 8981461,
	author = "Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Recent massively parallel computers are based on clusters of PCs. These machines use one of the recently proposed standard interconnects. These interconnects either use source routing or distributed routing based on forwarding tables. While source routers are simpler, distributed routers provides more flexibility allowing the network to achieve a higher performance. Distributed routing can be implemented by a fixed hardware specific to a routing function on a given topology or by using forwarding tables. The main problem of this approach is the lack of scalability of forwarding tables. In this paper, we propose a distributed routing strategy for commercial switches, flexible interval routing, that is scalable, both in memory and routing time because it is not based on tables. At the same time, the strategy is easy to reconfigure, being able to implement the most commonly used routing algorithms in the most widely used regular topologies. [All rights reserved Elsevier]",
	address = "USA",
	doi = "10.1016/j.jpdc.2005.12.012",
	issn = "0743-7315",
	journal = "Journal of Parallel and Distributed Computing",
	keywords = "multiprocessor interconnection networks;telecommunication network routing;workstation clusters;",
	note = "FIR;flexible interval routing;network routing;PC clusters;network topology;",
	number = 7,
	pages = "907 - 21",
	title = "{FIR}: {A}n efficient routing strategy for tori and meshes",
	url = "http://dx.doi.org/10.1016/j.jpdc.2005.12.012",
	volume = 66,
	year = 2006
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Full QoS support with 2 VCs for single-chip switches. 2006, 4 pp. -. BibTeX

@conference{ 9077009,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Current interconnection standards providing hardware support for quality of service (QoS) consider up to 16 virtual channels (VCs) for this purpose. However, most implementations do not offer so many because VCs increase the complexity of the switch and the scheduling delays. We have shown that this number of VCs can be significantly reduced, because it is enough to use two VCs for QoS purposes at each switch port. In this paper, we explore two alternative switch designs that take advantage of this reduction",
	address = "Los Alamitos, CA, USA",
	journal = "5th IEEE International Symposium on Network Computing and Applications",
	keywords = "communication complexity;logic design;multiprocessor interconnection networks;network interfaces;network-on-chip;packet switching;quality of service;telecommunication channels;telecommunication traffic;",
	note = "single-chip switches;interconnection standard;quality of service;virtual channel;switch complexity;scheduling delay;switch design;network interface;",
	pages = "4 pp. -",
	title = "{F}ull {Q}o{S} support with 2 {VC}s for single-chip switches",
	year = 2006
}

Alejandro Martinez, Francisco J Alfaro, Jose L Sanchez and Jose Duato. Full QoS support with 2 VCs for single-chip switches. 2006, 239 - 242. URL BibTeX

@conference{ 20071710566900,
	author = "Alejandro Martinez and Francisco J. Alfaro and Jose L. Sanchez and Duato, Jose",
	abstract = "Current interconnection standards providing hardware support for quality of service (QoS) consider up to 16 virtual channels (VCs) for this purpose. However, most implementations do not offer so many because VCs increase the complexity of the switch and the scheduling delays. We have shown that this number of VCs can be significantly reduced, because it is enough to use two VCs for QoS purposes at each switch port. In this paper, we explore two alternative switch designs that take advantage of this reduction. {{\&}}copy; 2006 IEEE.",
	address = "Cambridge, MA, United states",
	journal = "Proceedings - Fifth IEEE International Symposium on Network Computing and Applications, NCA 2006",
	key = "Quality of service",
	keywords = "Computational complexity;Computer hardware;Interconnection networks;Switching systems;",
	note = "Hardware support;Interconnection standards;Scheduling delays;Virtual channels (VC);",
	pages = "239 - 242",
	title = "{F}ull {Q}o{S} support with 2 {VC}s for single-chip switches",
	url = "http://dx.doi.org/10.1109/NCA.2006.33",
	volume = 2006,
	year = 2006
}

B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. MMR: A MultiMedia Router architecture to support hybrid workloads. Journal of Parallel and Distributed Computing 66(2):307 - 321, 2006. URL BibTeX

@article{ 2006039654031,
	author = "B. Caminero and C. Carrion and F.J. Quiles and Duato, Jose and S. Yalamanchili",
	abstract = "During the past few years, multimedia traffic with quality of service (QoS) requirements has become of widespread use. Media servers are usually built as clusters of workstations connected by a high-performance interconnection network. However, these high-performance networks do not usually offer differentiated support for multimedia traffic. The MultiMedia Router (MMR) is a proposal to address the QoS issue in cluster networks within a compact architecture, while also integrating conventional best-effort traffic. This paper describes the main architectural features of the MMR, such as the use of a hybrid switching technique, credit-based flow control, or small input buffers. Also, the main design parameters are tuned by means of simulation. It can be seen how proper differentiation among the different traffic classes is achieved, while retaining a compact design with small buffers. {{\&}}copy; 2005 Elsevier Inc. All rights reserved.",
	issn = 07437315,
	journal = "Journal of Parallel and Distributed Computing",
	key = "Hybrid computers",
	keywords = "Buffer circuits;Computer simulation;Flow control;Interconnection networks;Quality of service;Servers;Telecommunication traffic;",
	note = "Clusters of workstations (COW);Hybrid switching technique;Multimedia transmissions;Router architecture;",
	number = 2,
	pages = "307 - 321",
	title = "{MMR}: {A} {M}ulti{M}edia {R}outer architecture to support hybrid workloads",
	url = "http://dx.doi.org/10.1016/j.jpdc.2005.10.002",
	volume = 66,
	year = 2006
}

J -C Cano, José Cano Reyes, P Manzoni and Dongkyun Kim. On the design of pervasive computing applications based on Bluetooth and a P2P concept. In Wireless Pervasive Computing, 2006 1st International Symposium on. 2006, 6 pp.. URL, DOI BibTeX

@conference{ 1613599,
	author = "J. -C. Cano and Cano Reyes, Jos{\'e} and P. Manzoni and Dongkyun Kim",
	abstract = "As an example of implementing the pervasive computing, we developed an application framework to deploy an easy, spontaneous, and infrastructureless network. We selected the Bluetooth technology with the peer-to-peer (P2P) concept to develop an experimental application which enables peers to exchange their resources. Overall network architecture and the prototype application are presented. We selected a small test-bed and simulation to evaluate the overall performance and system behaviour. We measured and present our findings in term of the duration of inquiry procedure and the throughput, according to various experimental parameters such as physical distance between nodes and their speeds.",
	booktitle = "Wireless Pervasive Computing, 2006 1st International Symposium on",
	doi = "10.1109/ISWPC.2006.1613599",
	isbn = "0-7803-9410-0",
	keywords = "Bluetooth; P2P concept; network architecture; peer-to-peer; pervasive computing applications; Bluetooth; peer-to-peer computing; ubiquitous computing;",
	month = "jan.",
	pages = "6 pp.",
	publisher = "IEEE Computer Society",
	title = "{O}n the design of pervasive computing applications based on {B}luetooth and a {P}2{P} concept",
	url = "http://dx.doi.org/10.1109/ISWPC.2006.1613599",
	year = 2006
}

Francisco Gilabert, Maria E Gomez, Pedro Lopez and Jose Duato. On the influence of the selection function on the performance of fat-trees. 2006, 864 - 73. BibTeX

@conference{ 9112992,
	author = "Gilabert, Francisco and Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Fat-tree topology has become very popular among switch manufacturers. Routing in fat-trees is composed of two phases, an adaptive upwards phase, and a deterministic downwards phase. The unique downwards path to the destination depends on the switch that has been reached in the upwards phase. As adaptive routing is used in the ascending phase, several output ports are possible at each switch and the final choice depends on the selection function. The impact of the selection function on performance has been previously studied for direct networks and has not resulted to be very important. In fat-trees, the decisions made in the upwards phase by the selection function can be critical, since it determines the switch reached in the upwards phase, and therefore the unique downwards path to the destination. In this paper, we analyze the effect of the selection function on fat-trees. Several selection functions are defined, compared and evaluated. The evaluation shows that selection function has a great impact on fat-trees",
	address = "Berlin, Germany",
	journal = "Euro-Par 2006 Parallel Processing. 12th International Euro-Par Conference. Proceedings (Lecture Notes in Computer Science Vol. 4128)",
	keywords = "telecommunication network routing;telecommunication network topology;telecommunication switching;trees;",
	note = "selection function;fat-trees;adaptive routing;interconnection networks;",
	pages = "864 - 73",
	title = "{O}n the influence of the selection function on the performance of fat-trees",
	year = 2006
}

P Morillo, W Moncho, J M Orduria and Jose Duato. Providing full awareness to distributed virtual environments based on peer-to-peer architectures. 2006, 336 - 47. BibTeX

@conference{ 9027149,
author = "P. Morillo and W. Moncho and J.M. Orduria and Duato, Jose",
abstract = "Large scale distributed virtual environments (DVEs) have become a major trend in distributed applications, mainly due to the enormous popularity of multiplayer online games in the entertainment industry. Since architectures based on networked servers seem to be not scalable enough to support massively multiplayer applications, peer-to-peer (P2P) architectures have been proposed as an efficient and truly scalable solution for this kind of systems. However, the main challenge of P2P architectures consists of providing each avatar with updated information about which other avatars are its neighbors. We have denoted this problem as the awareness problem. Although some proposals have been made, none of them provide total awareness to avatars under any situation. This paper presents a new awareness method based on unicast communication that is capable of providing awareness to 100% of avatars, regardless of both their location and their movement pattern in the virtual world. Therefore, it allows large scale DVEs based on P2P architectures to properly scale with the number of users while fully providing awareness to all of them",
address = "Berlin, Germany",
journal = "Advances in Computer Graphics. 24th Computer Graphics International Conference, CGI 2006. Proceedings (Lecture Notes in Computer Science Vol.4035)",
keywords = "avatars;peer-to-peer computing;",
note = "peer-to-peer architectures;large scale distributed virtual environments;P2P architectures;avatar;unicast communication;",
pages = "336 - 47",
title = "{P}roviding full awareness to distributed virtual environments based on peer-to-peer architectures",
year = 2006
}

P Morillo, W Moncho, J M Orduna and Jose Duato. Providing full awareness to distributed virtual environments based on peer-to-peer architectures. 2006, 336 - 347. BibTeX

@conference{ 20063010029819,
	author = "P. Morillo and W. Moncho and J.M. Orduna and Duato, Jose",
	abstract = "In recent years, large scale distributed virtual environments (DVEs) have become a major trend in distributed applications, mainly due to the enormous popularity of multiplayer online games in the entertainment industry. Since architectures based on networked servers seems to be not scalable enough to support massively multiplayer applications, peer-to-peer (P2P) architectures have been proposed as an efficient and truly scalable solution for this kind of systems. However, the main challenge of P2P architectures consists of providing each avatar with updated information about which other avatars are its neighbors. We have denoted this problem as the awareness problem. Although some proposals have been made, none of them provide total awareness to avatars under any situation. This paper presents a new awareness method based on unicast communication that is capable of providing awareness to 100% of avatars, regardless of both their location and their movement pattern in the virtual world. Therefore, it allows large scale DVEs based on P2P architectures to properly scale with the number of users while fully providing awareness to all of them. {{\&}}copy; Springer-Verlag Berlin Heidelberg 2006.",
	address = "Hangzhou, China",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Virtual reality",
	keywords = "Communication systems;Computer architecture;Computer graphics;Computer science;Information technology;Online systems;",
	note = "Distributed virtual environments (DVE);Multiplayer online games;Peer-to-peer (P2P) architectures;",
	pages = "336 - 347",
	title = "{P}roviding full awareness to distributed virtual environments based on peer-to-peer architectures",
	volume = "4035 LNCS",
	year = 2006
}

A Martinez, G Apostolopoulos, F J Alfaro, J L Sanchez and Jose Duato. QoS support for video transmission in high-speed interconnects. 2006, 631 - 641. BibTeX

@conference{ 20064410206368,
	author = "A. Martinez and G. Apostolopoulos and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Multimedia traffic presents some special requirements that are unattainable with a best-effort service. Current interconnect standards provide mechanisms to overcome the limitations of the best-effort model, but they do not suffice to satisfy the strict requirements of video transmissions. This problem has been extensively addressed at the general networking community. Several solutions have arisen, but they are too complex to be applied to high speed-interconnects. In this paper, we propose a network architecture that is at the same time compatible with the requirements of high-speed interconnects and provides video traffic with the QoS it demands. {{\&}}copy; Springer-Verlag Berlin Heidelberg 2006.",
	address = "Munich, Germany",
	issn = 03029743,
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	key = "Video signal processing",
	keywords = "Communication channels (information theory);Multimedia systems;Optical interconnects;Quality of service;Scheduling;Telecommunication traffic;",
	note = "Clusters;Switch design;Video transmissions;Virtual channels;",
	pages = "631 - 641",
	title = "{Q}o{S} support for video transmission in high-speed interconnects",
	volume = "4208 LNCS",
	year = 2006
}

A Martinez, G Apostolopoulos, F J Alfaro, J L Sanchez and Jose Duato. QoS support for video transmission in high-speed interconnects. 2006, 631 - 41. BibTeX

@conference{ 9132719,
	author = "A. Martinez and G. Apostolopoulos and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Multimedia traffic presents some special requirements that are unattainable with a best-effort service. Current interconnect standards provide mechanisms to overcome the limitations of the best-effort model, but they do not suffice to satisfy the strict requirements of video transmissions. This problem has been extensively addressed at the general networking community. Several solutions have arisen, but they are too complex to be applied to high speed-interconnects. In this paper, we propose a network architecture that is at the same time compatible with the requirements of high-speed interconnects and provides video traffic with the QoS it demands",
	address = "Berlin, Germany",
	journal = "High Performance Computing and Communications. Second International Conference, HPCC 2006. Proceedings (Lecture Notes in Computer Science Vol.4208)",
	keywords = "multimedia communication;quality of service;telecommunication traffic;video communication;",
	note = "QoS;video transmission;high-speed interconnects;multimedia traffic;network architecture;",
	pages = "631 - 41",
	title = "{Q}o{S} support for video transmission in high-speed interconnects",
	year = 2006
}

Rafael Ubal, José Cano Reyes, Salvador Petit and Julio Sahuquillo. RACFP: a training tool to work with floating-point representation, algorithms, and circuits in undergraduate courses. Education, IEEE Transactions on 49(3):321 -331, 2006. URL, DOI BibTeX

@article{ 1668276,
	author = "Ubal, Rafael and Cano Reyes, Jos{\'e} and Petit, Salvador and Sahuquillo, Julio",
	abstract = "The design of pedagogical tools to train students is an interesting challenge for academic instructors in any educational area. Some approaches have appeared focusing on computer arithmetic, both integer and floating point. Floating-point arithmetic involves much more complexity; nevertheless, little time is usually devoted to this topic in computer engineering undergraduate courses. In this paper, RACFP is proposed as a pedagogical tool to work with floating-point in undergraduate courses. The tool has been designed with three abstraction levels according to the following learning outcomes: representation, arithmetic operation algorithms, and manufactured hardware circuits. The abstraction levels work independently, allowing for the use of RACFP in other courses, such as discrete mathematics or numerical methods, in which floating representation and related issues are also learning topics. RACFP design pursues two main goals: to minimize the complexity of the learning process and to encourage students when working with floating point. The first goal is achieved as a result of the multilevel design of the tool, while the second goal is achieved as RACFP shows how manufactured hardware implements generic algorithms",
	doi = "10.1109/TE.2006.879240",
	issn = "0018-9359",
	journal = "Education, IEEE Transactions on",
	keywords = "RACFP;computer engineering;floating-point algorithms;floating-point circuits;floating-point representation;pedagogical tool;training tools;undergraduate courses;computer science education;educational aids;educational courses;floating point arithmetic;trai",
	month = "aug.",
	number = 3,
	pages = "321 -331",
	title = "{RACFP}: a training tool to work with floating-point representation, algorithms, and circuits in undergraduate courses",
	url = "http://dx.doi.org/10.1109/TE.2006.879240",
	volume = 49,
	year = 2006
}

J M Montañana, Jose Flich, Antonio Robles and Jose Duato. Reachability-based fault-tolerant routing. In Parallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference on 1. 2006, 10 pp.. URL, DOI BibTeX

@conference{ 1655699,
	author = "Monta{\~n}ana, J. M. and Flich, Jose and Robles, Antonio and Duato, Jose",
	abstract = "Clusters of PCs are being used as cost-effective alternative to large parallel computers. In most of them it is critical to keep the system running even in the presence of faults. As the number of nodes increases in these systems, the interconnection network grows accordingly. Along with the increase in components the probability of faults increases dramatically, and thus, fault-tolerance in the system, in general, and in the interconnection network, in particular, plays a key role. An interesting approach to provide fault-tolerance consists of migrating on fly the paths affected by the failure to new fault-free paths. In this paper, we propose a simple and effective fault-tolerant routing methodology, referred to as reachability based fault tolerant routing (RFTR), that can be applied to any topology. RFTR builds new alternative paths by joining subpaths extracted from the set of already computed paths, thus being time-efficient. In order to avoid deadlocks, RFTR performs, if required, a virtual channel transition on the subpath union. As an example of applicability, in this paper we apply RFTR to InfiniBand. Evaluation results on tori show that RFTR exhibits a low computation cost and does not degrade performance significantly",
	booktitle = "Parallel and Distributed Systems, 2006. ICPADS 2006. 12th International Conference on",
	doi = "10.1109/ICPADS.2006.89",
	isbn = "0-7695-2612-8",
	issn = "1521-9097",
	keywords = "PC clusters;interconnection network;parallel computers;reachability-based fault-tolerant routing;virtual channel transition;fault tolerant computing;reachability analysis;telecommunication network routing;workstation clusters;",
	month = "0-0",
	pages = "10 pp.",
	title = "{R}eachability-based fault-tolerant routing",
	url = "http://dx.doi.org/10.1109/ICPADS.2006.89",
	volume = 1,
	year = 2006
}

P J Garcia, F J Quiles, Jose Flich, Jose Duato and I Johnson. RECN-DD: A Memory-Efficient Congestion Management Technique for Advanced Switching. In Parallel Processing, 2006. ICPP 2006. International Conference on. 2006, 23 -32. DOI BibTeX

@conference{ 1690602,
	author = "P.J. Garcia and F.J. Quiles and Flich, Jose and Duato, Jose and I. Johnson",
	abstract = "As VLSI technology advances, the interconnection network represents a larger percentage of the total system cost and power consumption. In fact, a current trend in network design is to reduce the number of components. However, this leads to systems working closer to saturation point, and therefore an efficient congestion management technique is required. In that sense, RECN has been recently proposed for advanced switching (AS). RECN detects the formation of congestion trees and dynamically allocates queues for storing congested packets, thus, eliminating the HOL blocking introduced by congestion trees. These queues are deallocated when congestion vanishes. We have identified two shortcomings that may affect RECN scalability and implementation. Firstly, although RECN allocates queues in an efficient way, resource deallocation is performed in-order, thus losing efficiency and wasting resources. This leads to an excessive requirement of memory at switch ports. Secondly, both allocation and deallocation mechanisms involve the use of specific control packets not supported by the AS standard, thus preventing RECN implementation. In this sense we provide a detailed description of the current RECN deallocation mechanism. In this paper we present an enhanced RECN version (RECN-DD) where these problems have been eliminated. Specifically, we propose a new distributed queue deallocation mechanism that reduces the number of required resources and does not require the use of control packets. Moreover, we propose a new congestion notification mechanism that does not require non-standard AS packets. Instead, flow control packets are used to notify congestion, thus simplifying the implementation of RECN-DD in AS",
	booktitle = "Parallel Processing, 2006. ICPP 2006. International Conference on",
	doi = "10.1109/ICPP.2006.62",
	issn = "0190-3918",
	keywords = "distributed queue deallocation;flow control packet;memory-efficient congestion management;regional explicit congestion notification;resource deallocation;multiprocessor interconnection networks;packet switching;queueing theory;telecommunication congestion",
	month = "14-18",
	pages = "23 -32",
	title = "{RECN}-{DD}: {A} {M}emory-{E}fficient {C}ongestion {M}anagement {T}echnique for {A}dvanced {S}witching",
	year = 2006
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Scalable low-cost QoS support for single-chip switches. 2006, 8 pp. -. BibTeX

@conference{ 9077868,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Virtual channels (VCs) are a popular solution for the provision of quality of service (QoS). Current interconnect standards propose 16 or even more VCs for this purpose. However, most commercial implementations do not offer so many VCs because it is too expensive in terms of silicon area. Therefore, a reduction of the number of VCs necessary to support QoS can be very helpful in the switch design and implementation. We have shown that this number of VCs can be reduced if the system is considered as a whole rather than each element being taken separately. Some of the scheduling decisions made at network interfaces can be easily reused at switches without significantly altering the global behavior. In this paper, our aim is to explore the scalability of the technique, considering the restrictions of the final chip implementation",
	address = "Los Alamitos, CA, USA",
	journal = "12th International Conference on Parallel and Distributed Systems",
	keywords = "network interfaces;performance evaluation;quality of service;scheduling;telecommunication switching;workstation clusters;",
	note = "QoS support;single-chip switches;virtual channels;interconnect standards;switch design;scheduling decisions;network interfaces;interconnection networks;storage area network;performance evaluation;",
	pages = "8 pp. -",
	title = "{S}calable low-cost {Q}o{S} support for single-chip switches",
	year = 2006
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Scalable low-cost QoS support for single-chip switches. 2006, 439 - 446. URL BibTeX

@conference{ 20071510539214,
	author = "A. Martinez and F.J. Alfaro and J.L. Sanchez and Duato, Jose",
	abstract = "Virtual channels (VCs) are a popular solution for the provision of quality of service (QoS). Current interconnect standards propose 16 or even more VCs for this purpose. However, most commercial implementations do not offer so many VCs because it is too expensive in terms of silicon area. Therefore, a reduction of the number of VCs necessary to support QoS can be very helpful in the switch design and implementation. We have shown that this number of VCs can be reduced if the system is considered as a whole rather than each element being taken separately. Some of the scheduling decisions made at network interfaces can be easily reused at switches without significantly altering the global behavior. In this paper, our aim is to explore the scalability of the technique, considering the restrictions of the final chip implementation. {{\&}}copy; 2006 IEEE.",
	address = "Minneapolis, MN, United states",
	issn = 15219097,
	journal = "Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS",
	key = "Semiconductor switches",
	keywords = "Clustering algorithms;Interconnection networks;Microprocessor chips;Quality of service;Scheduling algorithms;Silicon;",
	note = "Performance evaluation;Switch design;Virtual channels (VCs);",
	pages = "439 - 446",
	title = "{S}calable low-cost {Q}o{S} support for single-chip switches",
	url = "http://dx.doi.org/10.1109/ICPADS.2006.110",
	volume = 1,
	year = 2006
}

, Jose Flich, Jose Duato, S -A Reinemo and T Skeie. Segment-based routing: an efficient fault-tolerant routing algorithm for meshes and tori. In Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International. April 2006, 10 pp.. URL, DOI BibTeX

@conference{ 1639341,
author = ", and Flich, Jose and Duato, Jose and S.-A. Reinemo and T. Skeie",
abstract = "Computers get faster every year, but the demand for computing resources seems to grow at an even faster rate. Depending on the problem domain, this demand for more power can be satisfied by either, massively parallel computers, or clusters of computers. Common for both approaches is the dependence on high performance interconnect networks such as Myrinet, Infiniband, or 10 Gigabit Ethernet. While high throughput and low latency are key features of interconnection networks, the issue of fault-tolerance is now becoming increasingly important. As the number of network components grows so does the probability for failure, thus it becomes important to also consider the fault-tolerance mechanism of interconnection networks. The main challenge then lies in combining performance and fault-tolerance, while still keeping cost and complexity low. This paper proposes a new deterministic routing methodology for tori and meshes, which achieves high performance without the use of virtual channels. Furthermore, it is topology agnostic in nature, meaning it can handle any topology derived from any combination of faults when combined with static reconfiguration. The algorithm, referred to as segment-based routing (SR), works by partitioning a topology into subnets, and subnets into segments. This allows us to place bidirectional turn restrictions locally within a segment. As segments are independent, we gain the freedom to place turn restrictions within a segment independently from other segments. This results in a larger degree of freedom when placing turn restrictions compared to other routing strategies. In this paper a way to compute segment-based routing tables is presented and applied to meshes and tori. Evaluation results show that SR increases performance by a factor of 1.8 over FX and up*/down* routing",
booktitle = "Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International",
doi = "10.1109/IPDPS.2006.1639341",
keywords = "deterministic routing;fault-tolerant routing;interconnection networks;meshes;segment-based routing;tori;fault tolerant computing;multiprocessor interconnection networks;telecommunication network routing;telecommunication network topology;",
month = "april",
pages = "10 pp.",
title = "{S}egment-based routing: an efficient fault-tolerant routing algorithm for meshes and tori",
url = "http://dx.doi.org/10.1109/IPDPS.2006.1639341",
year = 2006
}

@conference{ 8969869,
author = ", and Flich, Jose and Duato, Jose and S.-A. Reinemo and T. Skeie",
abstract = "Computers get faster every year, but the demand for computing resources seems to grow at an even faster rate. Depending on the problem domain, this demand for more power can be satisfied by either, massively parallel computers, or clusters of computers. Common for both approaches is the dependence on high performance interconnect networks such as Myrinet, Infiniband, or 10 Gigabit Ethernet. While high throughput and low latency are key features of interconnection networks, the issue of fault-tolerance is now becoming increasingly important. As the number of network components grows so does the probability for failure, thus it becomes important to also consider the fault-tolerance mechanism of interconnection networks. The main challenge then lies in combining performance and fault-tolerance, while still keeping cost and complexity low. This paper proposes a new deterministic routing methodology for tori and meshes, which achieves high performance without the use of virtual channels. Furthermore, it is topology agnostic in nature, meaning it can handle any topology derived from any combination of faults when combined with static reconfiguration. The algorithm, referred to as segment-based routing (SR), works by partitioning a topology into subnets, and subnets into segments. This allows us to place bidirectional turn restrictions locally within a segment. As segments are independent, we gain the freedom to place turn restrictions within a segment independently from other segments. This results in a larger degree of freedom when placing turn restrictions compared to other routing strategies. In this paper a way to compute segment-based routing tables is presented and applied to meshes and tori. Evaluation results show that SR increases performance by a factor of 1.8 over FX and up*/down* routing",
address = "Piscataway, NJ, USA",
booktitle = "Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International",
doi = "10.1109/IPDPS.2006.1639341",
journal = "Proceedings. 20th International Parallel and Distributed Processing Symposium (IEEE Cat. No.06TH8860)",
keywords = "fault tolerant computing;multiprocessor interconnection networks;telecommunication network routing;telecommunication network topology;",
note = "segment-based routing;fault-tolerant routing;meshes;tori;interconnection networks;deterministic routing;",
pages = "10 pp. -",
title = "{S}egment-based routing: an efficient fault-tolerant routing algorithm for meshes and tori",
url = "http://dx.doi.org/10.1109/IPDPS.2006.1639341",
year = 2006
}

J Domenech, Julio Sahuquillo, J A Gil and A Pont. The Impact of the Web Prefetching Architecture on the Limits of Reducing User's Perceived Latency. In Web Intelligence, 2006. WI 2006. IEEE/WIC/ACM International Conference on. 2006, 740 -744. URL, DOI BibTeX

@conference{ 4061463,
	author = "J. Domenech and Sahuquillo, Julio and J.A. Gil and A. Pont",
	abstract = "Web prefetching is a technique that has been researched for years to reduce the latency perceived by users. For this purpose, several Web prefetching architectures have been used, but no comparative study has been performed to identify the best architecture dealing with prefetching. This paper analyzes the impact of the Web prefetching architecture focusing on the limits of reducing the user's perceived latency. To this end, the factors that constrain the predictive power of each architecture are analyzed and these theoretical limits are quantified. Experimental results show that the best element of the Web architecture to locate a single prediction engine is the proxy, whose implementation could reduce the perceived latency up to 67%. Schemes for collaborative predictors located at diverse elements of the Web architecture are also analyzed. These predictors could dramatically reduce the perceived latency, reaching a potential limit of about 97% for a mixed proxy-server collaborative prediction engine",
	booktitle = "Web Intelligence, 2006. WI 2006. IEEE/WIC/ACM International Conference on",
	doi = "10.1109/WI.2006.166",
	isbn = "0-7695-2747-7",
	keywords = "Web prefetching architecture;mixed proxy-server collaborative prediction engine;user perceived latency;Internet;groupware;online front-ends;search engines;",
	month = "dec.",
	pages = "740 -744",
	title = "{T}he {I}mpact of the {W}eb {P}refetching {A}rchitecture on the {L}imits of {R}educing {U}ser's {P}erceived {L}atency",
	url = "http://dx.doi.org/10.1109/WI.2006.166",
	year = 2006
}

A Martinez, P J Garcia, F J Alfaro, J L Sanchez, Jose Flich, F J Quiles and Jose Duato. Towards a cost-effective interconnection network architecture with QoS and congestion management support. 2006, 884 - 95. BibTeX

@conference{ 9112994,
	author = "A. Martinez and P.J. Garcia and F.J. Alfaro and J.L. Sanchez and Flich, Jose and F.J. Quiles and Duato, Jose",
	abstract = "Congestion management and quality of service (QoS) provision are two important issues in current network design. The most popular techniques proposed for both issues require the existence of specific resources in the interconnection network, usually a high number of separate queues at switch ports. Therefore, the implementation of these techniques is expensive or even in feasible. However, two novel, efficient, and cost-effective techniques for provision of QoS and for congestion management have been proposed recently. In this paper, we combine those techniques to build a single interconnection network architecture, providing an excellent performance while reducing the number of required resources",
	address = "Berlin, Germany",
	journal = "Euro-Par 2006 Parallel Processing. 12th International Euro-Par Conference. Proceedings (Lecture Notes in Computer Science Vol. 4128)",
	keywords = "interconnections;quality of service;telecommunication congestion control;",
	note = "cost-effective interconnection network;quality of service;congestion management;switch port;",
	pages = "884 - 95",
	title = "{T}owards a cost-effective interconnection network architecture with {Q}o{S} and congestion management support",
	year = 2006
}

Gaspar Mora, Jose Flich, Jose Duato, Pedro Lopez, Elvira Baydal and O Lysne. Towards an efficient switch architecture for high-radix switches. 2006, 11 - 20. URL, DOI BibTeX

@conference{ 10091275,
	author = "Mora, Gaspar and Flich, Jose and Duato, Jose and Lopez, Pedro and Baydal, Elvira and O. Lysne",
	abstract = "The interconnection network plays a key role in the overall performance achieved by high performance computing systems, also contributing an increasing fraction of its cost and power consumption. Current trends in interconnection network technology suggest that high-radix switches will be preferred as networks will become smaller (in terms of switch count) with the associated savings in packet latency, cost, and power consumption. Unfortunately, current switch architectures have scalability problems that prevent them from being effective when implemented with a high number of ports. In this paper, an efficient and cost-effective architecture for high-radix switches is proposed. The architecture, referred to as partitioned crossbar input queued (PCIQ), relies on three key components: a partitioned crossbar organization that allows the use of simple arbiters and crossbars, a packet-based arbiter, and a mechanism to eliminate the switch-level HOL blocking. Under uniform traffic, maximum switch efficiency is achieved. Furthermore, switch-level HOL blocking is completely eliminated under hot-spot traffic, again delivering maximum throughput. Additionally, PCIQ inherently implements an efficient congestion management technique that eliminates all the network-wide HOL blocking. On the contrary, the previously proposed architectures either show poor performance or they require significantly higher costs than PCIQ (in both components and complexity).",
	address = "Piscataway, NJ, USA",
	doi = "10.1109/ANCS.2006.4579519",
	journal = "ACM/IEEE Symposium on Architectures for Networking and Communications Systems (ANCS 2006)",
	keywords = "multistage interconnection networks;",
	note = "high-radix switch architecture;interconnection network;power consumption;partitioned crossbar input queued;switch-level head-of-line block elimination;congestion management technique;",
	pages = "11 - 20",
	title = "{T}owards an efficient switch architecture for high-radix switches",
	url = "http://dx.doi.org/10.1109/ANCS.2006.4579519",
	year = 2006
}