Publications 2001-2005

Gaspar Mora, Jose Flich, Jose Duato, Pedro Lopez, Elvira Baydal and O Lysne. Towards an efficient switch architecture for high-radix switches. 2006, 11 - 20. URL, DOI BibTeX

@inproceedings{ 10091275,
	author = "Mora, Gaspar and Flich, Jose and Duato, Jose and Lopez, Pedro and Baydal, Elvira and Lysne, O.",
	abstract = "The interconnection network plays a key role in the overall performance achieved by high performance computing systems, also contributing an increasing fraction of its cost and power consumption. Current trends in interconnection network technology suggest that high-radix switches will be preferred as networks will become smaller (in terms of switch count) with the associated savings in packet latency, cost, and power consumption. Unfortunately, current switch architectures have scalability problems that prevent them from being effective when implemented with a high number of ports. In this paper, an efficient and cost-effective architecture for high-radix switches is proposed. The architecture, referred to as partitioned crossbar input queued (PCIQ), relies on three key components: a partitioned crossbar organization that allows the use of simple arbiters and crossbars, a packet-based arbiter, and a mechanism to eliminate the switch-level HOL blocking. Under uniform traffic, maximum switch efficiency is achieved. Furthermore, switch-level HOL blocking is completely eliminated under hot-spot traffic, again delivering maximum throughput. Additionally, PCIQ inherently implements an efficient congestion management technique that eliminates all the network-wide HOL blocking. On the contrary, the previously proposed architectures either show poor performance or they require significantly higher costs than PCIQ (in both components and complexity).",
	address = "Piscataway, NJ, USA",
	booktitle = "ACM/IEEE Symposium on Architectures for Networking and Communications Systems (ANCS 2006)",
	doi = "10.1109/ANCS.2006.4579519",
	keywords = "multistage interconnection networks;high-radix switch architecture;interconnection network;power consumption;partitioned crossbar input queued;switch-level head-of-line block elimination;congestion management technique;",
	pages = "11--20",
	title = "{T}owards an efficient switch architecture for high-radix switches",
	url = "http://dx.doi.org/10.1109/ANCS.2006.4579519",
	year = 2006
}

Salvador Petit, Julio Sahuquillo and A Pont. A comparison study of the HLRC-DU protocol versus a HLRC hardware assisted protocol. In Parallel, Distributed and Network-Based Processing, 2005. PDP 2005. 13th Euromicro Conference on. 2005, 197 - 204. URL, DOI BibTeX

@inproceedings{ 1386059,
	author = "Petit, Salvador and Sahuquillo, Julio and Pont, A.",
	abstract = "SVM systems are a cheaper and flexible way to implement the shared memory programming paradigm. Their huge flexibility is due to their software implementation; however, this is also the main responsible of their performance drawbacks with respect to hardware systems. In this paper we compare a pure software HLRC protocol called the HLRC-DU, versus an improved version of the HLRC protocol that uses hardware support to reduce asynchronous communication. Performances of both protocols are compared over a baseline HLRC protocol. Results show that, by on the half of the benchmarks, our protocol performs better than the hardware approach, even more, in some cases our protocol reaches a speedup higher than 22% with respect to the baseline protocol.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2005. PDP 2005. 13th Euromicro Conference on",
	doi = "10.1109/EMPDP.2005.2",
	isbn = "0-7695-2280-7",
	issn = "1066-6192",
	keywords = "HLRC hardware assisted protocol; SVM systems; asynchronous communication; memory consistency protocols; shared memory programming paradigm; shared virtual memory systems; software HLRC-DU protocol; distributed programming; memory protocols; microprogramm",
	month = feb,
	pages = "197--204",
	title = "{A} comparison study of the {HLRC}-{DU} protocol versus a {HLRC} hardware assisted protocol",
	url = "http://dx.doi.org/10.1109/EMPDP.2005.2",
	year = 2005
}

Elvira Baydal, Pedro Lopez and Jose Duato. A family of mechanisms for congestion control in wormhole networks. Parallel and Distributed Systems, IEEE Transactions on 16(9):772 - 784, 2005. URL, DOI BibTeX

@article{ 1490509,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Multiprocessor interconnection networks may reach congestion with high traffic loads, which prevents reaching the wished performance. Unfortunately, many of the mechanisms proposed in the literature for congestion control either suffer from a lack of robustness, being unable to work properly with different traffic patterns or message lengths, or detect congestion relying on global information that wastes some network bandwidth. This paper presents a family of mechanisms to avoid network congestion in wormhole networks. All of them need only local information, applying message throttling when it is required. The proposed mechanisms use different strategies to detect network congestion and also apply different corrective actions. The mechanisms are evaluated and compared for several network loads and topologies, noticeably improving network performance with high loads but without penalizing network behavior for low and medium traffic rates, where no congestion control is required.",
	doi = "10.1109/TPDS.2005.102",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "message throttling; multiprocessor interconnection network; network bandwidth; network congestion control; traffic load; wormhole network; wormhole switching; multiprocessor interconnection networks; telecommunication congestion control; telecommunicatio",
	month = sep,
	number = 9,
	pages = "772--784",
	title = "{A} family of mechanisms for congestion control in wormhole networks",
	url = "http://dx.doi.org/10.1109/TPDS.2005.102",
	volume = 16,
	year = 2005
}

Elvira Baydal, Pedro Lopez and Jose Duato. A family of mechanisms for congestion control in wormhole networks. IEEE Transactions on Parallel and Distributed Systems 16(9):772 - 84, 2005. URL BibTeX

@article{ 8570709,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Multiprocessor interconnection networks may reach congestion with high traffic loads, which prevents reaching the wished performance. Unfortunately, many of the mechanisms proposed in the literature for congestion control either suffer from a lack of robustness, being unable to work properly with different traffic patterns or message lengths, or detect congestion relying on global information that wastes some network bandwidth. This paper presents a family of mechanisms to avoid network congestion in wormhole networks. All of them need only local information, applying message throttling when it is required. The proposed mechanisms use different strategies to detect network congestion and also apply different corrective actions. The mechanisms are evaluated and compared for several network loads and topologies, noticeably improving network performance with high loads but without penalizing network behavior for low and medium traffic rates, where no congestion control is required",
	address = "USA",
	doi = "10.1109/TPDS.2005.102",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "multiprocessor interconnection networks;telecommunication congestion control;telecommunication network routing;telecommunication network topology;telecommunication switching;telecommunication traffic;multiprocessor interconnection network;traffic load;network congestion control;network bandwidth;wormhole network;message throttling;wormhole switching;",
	number = 9,
	pages = "772--784",
	title = "{A} family of mechanisms for congestion control in wormhole networks",
	url = "http://dx.doi.org/10.1109/TPDS.2005.102",
	volume = 16,
	year = 2005
}

Maria E Gomez, Pedro Lopez and Jose Duato. A Memory-Effective Fault-Tolerant Routing Strategy for Direct Interconnection Networks. In Parallel and Distributed Computing, 2005. ISPDC 2005. The 4th International Symposium on. July 2005, 341 -348. URL, DOI BibTeX

@inproceedings{ 1609988,
	author = "Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "High-performance interconnection networks are crucial in massively parallel computers. Routing is one of the most important design issues of interconnection networks. Moreover, the huge amount of hardware of these machines makes fault-tolerance another important design issue. In this paper, we propose a mechanism that combines scalable routing and fault-tolerance for commercial switches to build direct regular topologies, which are the topologies used in large machines. The hardware required is not complex. Furthermore, it allows a high degree of fault-tolerance inflicting a minimal decrease of performance",
	booktitle = "Parallel and Distributed Computing, 2005. ISPDC 2005. The 4th International Symposium on",
	doi = "10.1109/ISPDC.2005.6",
	keywords = "adaptive routing;direct interconnection networks;distributed routing;memory-effective fault-tolerant routing;fault tolerance;multiprocessor interconnection networks;telecommunication network reliability;telecommunication network routing;",
	month = jul,
	pages = "341--348",
	title = "{A} {M}emory-{E}ffective {F}ault-{T}olerant {R}outing {S}trategy for {D}irect {I}nterconnection {N}etworks",
	url = "http://dx.doi.org/10.1109/ISPDC.2005.6",
	year = 2005
}

Maria E Gomez, Pedro Lopez and Jose Duato. A memory-effective fault-tolerant routing strategy for direct interconnection networks. 2005, 341 - 8. BibTeX

@inproceedings{ 8762349,
	author = "Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "High-performance interconnection networks are crucial in massively parallel computers. Routing is one of the most important design issues of interconnection networks. Moreover, the huge amount of hardware of these machines makes fault-tolerance another important design issue. In this paper, we propose a mechanism that combines scalable routing and fault-tolerance for commercial switches to build direct regular topologies, which are the topologies used in large machines. The hardware required is not complex. Furthermore, it allows a high degree of fault-tolerance inflicting a minimal decrease of performance",
	address = "Los Alamitos, CA, USA",
	booktitle = "ISPDC 2005. The 4th International Workshop on Parallel and Distributed Computing",
	keywords = "fault tolerance;multiprocessor interconnection networks;telecommunication network reliability;telecommunication network routing;memory-effective fault-tolerant routing;direct interconnection networks;distributed routing;adaptive routing;",
	pages = "341--348",
	title = "{A} memory-effective fault-tolerant routing strategy for direct interconnection networks",
	year = 2005
}

Maria E Gomez, Pedro Lopez and Jose Duato. A Memory-Effective Routing Strategy for Regular Interconnection Networks. In Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International. April 2005, 41b - 41b. URL, DOI BibTeX

@inproceedings{ 1419862,
	author = "Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Massively parallel computing systems have been or are being built with thousands of nodes. In such systems, high-performance interconnection networks are crucial to achieve the maximum performance. Routing is one of the most important design issues of interconnection networks. Routing strategies can be mainly classified as source and distributed routing. Source routing has been used in some networks because routers are very simple. On the other hand, distributed routing allows more flexibility, but the routers are more complex. Distributed routing can be implemented by a fixed hardware specific to a routing function on a given topology, or by using forwarding tables that are very flexible but suffer from a lack of scalability. In this paper, we propose a distributed routing strategy for commercial switches, Flexible Interval Routing, that is scalable for the most widely used regular topologies (tori and meshes) because it is not based on tables. At the same time, the strategy is easy to reconfigure to deal with changes in the topology or in the routing algorithm for a given topology, being able to implement the most commonly-used routing algorithms in regular topologies.",
	booktitle = "Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International",
	doi = "10.1109/IPDPS.2005.44",
	keywords = "distributed routing; flexible interval routing; high-performance interconnection networks; memory-effective routing strategy; parallel computing system; multiprocessor interconnection networks; network routing; parallel machines; performance evaluation;",
	month = apr,
	pages = "41b",
	title = "{A} {M}emory-{E}ffective {R}outing {S}trategy for {R}egular {I}nterconnection {N}etworks",
	url = "http://dx.doi.org/10.1109/IPDPS.2005.44",
	year = 2005
}

Maria E Gomez, Pedro Lopez and Jose Duato. A memory-effective routing strategy for regular interconnection networks. In Proceedings - 19th IEEE International Parallel and Distributed Processing Symposium. 2005, 41b. BibTeX

@inproceedings{ 2005509538034,
	author = "Gomez, Maria E. and Lopez, Pedro and Duato, Jose",
	abstract = "Massively parallel computing systems are being built with thousands of nodes. In such systems, high-performance inter-connection networks are crucial to achieve the maximum performance. Routing is one of the most important design issues of interconnection networks. Routing strategies can be mainly classified as source and distributed routing. Source routing has been used in some networks because routers are very simple. On the other hand, distributed routing allows more flexibility, but the routers are more complex. Distributed routing can be implemented by a fixed hardware specific to a routing function on a given topology, or by using forwarding tables that are very flexible but suffer from a lack of scalability. In this paper, we propose a distributed routing strategy for commercial switches, Flexible Interval Routing, that is scalable for the most widely used regular topologies (tori and meshes) because it is not based on tables. At the same time, the strategy is easy to reconfigure to deal with changes in the topology or in the routing algorithm for a given topology, being able to implement the most commonly-used routing algorithms in regular topologies.",
	address = "Denver, CO, United States",
	booktitle = "Proceedings - 19th IEEE International Parallel and Distributed Processing Symposium",
	keywords = "Interconnection networks;Algorithms;Computer hardware;Data storage equipment;Parallel processing systems;Routers;Switches;Topology;Distributed routing;Routing algorithms;Routing strategies;Source routing;",
	pages = "41b",
	title = "{A} memory-effective routing strategy for regular interconnection networks",
	year = 2005
}

P Morillo, J M Orduna, M Fernandez and Jose Duato. A method for providing QoS in distributed virtual environments. 2005, 152 - 9. BibTeX

@inproceedings{ 8486327,
	author = "Morillo, P. and Orduna, J. M. and Fernandez, M. and Duato, Jose",
	abstract = "One of the key issues in the design of scalable and cost-effective distributed virtual environment systems is the partitioning problem. It consists of efficiently assigning clients (3D avatars) to the servers in the system, and some proposed methods allow to significantly increase system throughput. However, these methods are not focused on satisfying any specific time constraint. In this paper, we show that the problem of providing quality of service in distributed virtual environment systems can be addressed by means of the partitioning method. Additionally, we propose a partitioning method that not only provides a high system throughput, but it also satisfies (if possible) any time constraint that avatars can require. This method is based on a heuristic search technique that looks for the best trade-off between system latency, system throughput and partitioning efficiency. The evaluation results show that this partitioning method allows to greatly increase the number of avatars provided with quality of service while also providing the highest system throughput as possible",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 13th Euromicro Conference on Parallel, Distributed and Network-based Processing",
	keywords = "client-server systems;quality of service;search problems;virtual reality;distributed virtual environments;partitioning method;heuristic search technique;",
	pages = "152--159",
	title = "{A} method for providing {QoS} in distributed virtual environments",
	year = 2005
}

Jose Duato, I Johnson, Jose Flich, F Naven, P Garcia and Teresa Nachiondo. A new scalable and cost-effective congestion management strategy for lossless multistage interconnection networks. In High-Performance Computer Architecture, 2005. HPCA-11. 11th International Symposium on. February 2005, 108 - 119. URL, DOI BibTeX

@inproceedings{ 1385933,
	author = "Duato, Jose and Johnson, I. and Flich, Jose and Naven, F. and Garcia, P. and Nachiondo, Teresa",
	abstract = "In this paper, we propose a new congestion management strategy for lossless multistage interconnection networks that scales as network size and/or link bandwidth increase. Instead of eliminating congestion, our strategy avoids performance degradation beyond the saturation point by eliminating the HOL blocking produced by congestion trees. This is achieved in a scalable manner by using separate queues for congested flows. These are dynamically allocated only when congestion arises, and deallocated when congestion subsides. Performance evaluation results show that our strategy responds to congestion immediately and completely eliminates the performance degradation produced by HOL blocking while using only a small number of additional queues.",
	booktitle = "High-Performance Computer Architecture, 2005. HPCA-11. 11th International Symposium on",
	doi = "10.1109/HPCA.2005.1",
	isbn = "0-7695-2275-0",
	issn = "1530-0897",
	keywords = "HOL blocking; congestion management; congestion trees; lossless multistage interconnection networks; network queue; computer network management; multistage interconnection networks; queueing theory; telecommunication congestion control;",
	month = feb,
	pages = "108--119",
	title = "{A} new scalable and cost-effective congestion management strategy for lossless multistage interconnection networks",
	url = "http://dx.doi.org/10.1109/HPCA.2005.1",
	year = 2005
}

S Rueda, P Morillo, J M Orduna and Jose Duato. A sexual elitist genetic algorithm for providing QoS in distributed virtual environment systems. In Proceedings. 19th IEEE International Parallel and Distributed Processing Symposium. 2005, 8 pp. BibTeX

@inproceedings{ 8548873,
	author = "Rueda, S. and Morillo, P. and Orduna, J. M. and Duato, Jose",
	abstract = "Architectures based on networked servers have become a de-facto standard for distributed virtual environment (DVE) systems. These systems allow a large number of remote users to share a single 3D virtual scene. In order to provide quality of service in a DVE system, clients should be assigned to servers taking into account system throughput and system latency. This highly complex problem is known as the quality of service (QoS) problem. This paper proposes an elitist sexual genetic algorithm for solving the QoS problem in distributed virtual environment systems. Performance evaluation results show that, due to its ability of both finding good search paths and keeping diversity escaping from local minima, this nature inspired technique can provide significantly better solutions than other heuristic methods with shorter execution times. Therefore, the proposed implementation of GA search method can improve the QoS offered by DVE systems",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 19th IEEE International Parallel and Distributed Processing Symposium",
	keywords = "client-server systems;genetic algorithms;quality of service;search problems;virtual reality;sexual elitist genetic algorithm;QoS;distributed virtual environment system;networked server;client-server system;performance evaluation;search path;",
	note = "8 pp.",
	title = "{A} sexual elitist genetic algorithm for providing {QoS} in distributed virtual environment systems",
	year = 2005
}

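S Rueda, P Morillo, J M Orduna and Jose Duato. A sexual elitist genetic algorithm for providing QoS in distributed virtual environment systems. In Proceedings - 19th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2005. IEEE Computer Society, 2005. URL BibTeX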

@inproceedings{ 20063010031238,
	author = "Rueda, S. and Morillo, P. and Orduna, J. M. and Duato, Jose",
	abstract = "Architectures based on networked servers have become a de-facto standard for Distributed Virtual Environment (DVE) systems. These systems allow a large number of remote users to share a single 3D virtual scene. In order to provide quality of service in a DVE system, clients should be assigned to servers taking into account system throughput and system latency. This highly complex problem is known as the quality of service (QoS) problem. This paper proposes an elitist sexual genetic algorithm for solving the QoS problem in Distributed Virtual Environment systems. Performance evaluation results show that, due to its ability of both finding good search paths and keeping diversity escaping from local minima, this nature inspired technique can provide significantly better solutions than other heuristic methods with shorter execution times. Therefore, the proposed implementation of GA search method can improve the QoS offered by DVE systems.",
	address = "Denver, CO, United States",
	booktitle = "Proceedings - 19th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2005",
	doi = "10.1109/IPDPS.2005.67",
	keywords = "Distributed computer systems;Genetic algorithms;Problem solving;Quality of service;Servers;Virtual reality;Distributed Virtual Environment (DVE) systems;Execution times;System latency;Virtual scene;",
	publisher = "IEEE Computer Society",
	title = "{A} sexual elitist genetic algorithm for providing {QoS} in distributed virtual environment systems",
	url = "http://dx.doi.org/10.1109/IPDPS.2005.67",
	volume = 2005,
	year = 2005
}

Manuel E Acacio, Jose Gonzalez, Jose M Garcia and Jose Duato. A two-level directory architecture for highly scalable cc-NUMA multiprocessors. IEEE Transactions on Parallel and Distributed Systems 16(1):67 - 79, 2005. URL BibTeX

@article{ 2005078834390,
	author = "Acacio, Manuel E. and Gonzalez, Jose and Garcia, Jose M. and Duato, Jose",
	abstract = "One important issue the designer of a scalable shared-memory multiprocessor must deal with is the amount of extra memory required to store the directory information. It is desirable that the directory memory overhead be kept as low as possible, and that it scales very slowly with the size of the machine. Unfortunately, current directory architectures provide scalability at the expense of performance. This work presents a scalable directory architecture that significantly reduces the size of the directory for large-scale configurations of a multiprocessor without degrading performance. First, we propose multilayer clustering as an effective approach to reduce the width of directory entries. Based on this concept, we derive three new compressed sharing codes, some of them with a space complexity of O(log 2(log2(N))) for an N-node system. Then, we present a novel two-level directory architecture to eliminate the penalty caused by compressed directories in general. The proposed organization consists of a small full-map first-level directory (which provides precise information for the most recently referenced lines) and a compressed second-level directory (which provides in-excess information for all the lines). The proposals are evaluated based on extensive execution-driven simulations (using RSIM) of a 64-node cc-NUMA multiprocessor. Results demonstrate that a system with a two-level directory architecture achieves the same performance as a multiprocessor with a big and nonscalable full-map directory, with a very significant reduction of the memory overhead. © 2005 IEEE.",
	doi = "10.1109/TPDS.2005.4",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Multiprocessing systems;Cache memory;Computer architecture;Computer simulation;Network protocols;Optimization;Compressed sharing codes;Directory memory overhead;Shared memory multiprocessor;Two level directory architecture;Unnecessary coherence messages;",
	number = 1,
	pages = "67--79",
	title = "{A} two-level directory architecture for highly scalable cc-{NUMA} multiprocessors",
	url = "http://dx.doi.org/10.1109/TPDS.2005.4",
	volume = 16,
	year = 2005
}

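O Lysne, T M Pinkston and Jose Duato. A methodology for developing deadlock-free dynamic network reconfiguration processes. Part II. IEEE Transactions on Parallel and Distributed Systems 16(5):428 - 443, 2005. BibTeX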

@article{ 8399917,
	author = "Lysne, O. and Pinkston, T. M. and Duato, Jose",
	abstract = "For pt.I see ibid., vol.16, no.5, p.412-427 (2005). Dynamic network reconfiguration is defined as the process of changing from one routing function to another while the network remains up and running. The main challenge is in avoiding deadlock anomalies while keeping restrictions on packet injection and forwarding minimal. Current approaches either require virtual channels in the network or they work only for a limited set of routing algorithms and/or fault patterns. In this paper, we present a methodology for devising deadlock free and dynamic transitions between old and new routing functions that is consistent with newly proposed theory [J. Duato et al., (2005)]. The methodology is independent of topology, can be applied to any deadlock-free routing function, and puts no restrictions on the routing function changes that can be supported. Furthermore, it does not require any virtual channels to guarantee deadlock freedom. This research is motivated by current trends toward using increasingly larger Internet and transaction processing servers based on clusters of PCs that have very high availability and dependability requirements, as well as other local, system, and storage area network-based computing systems",
	address = "USA",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Internet;network routing;network servers;reconfigurable architectures;storage area networks;system recovery;transaction processing;workstation clusters;dynamic network reconfiguration;routing algorithm;fault pattern;deadlock free dynamic transition;deadlock-free routing function;virtual channel;transaction processing server;clusters PC;storage area network-based computing system;",
	number = 5,
	pages = "428--443",
	title = "{A} methodology for developing deadlock-free dynamic network reconfiguration processes. {Part II}",
	volume = 16,
	year = 2005
}

Jose Duato, O Lysne, R Pang and T M Pinkston. A theory for deadlock-free dynamic network reconfiguration. Part I. IEEE Transactions on Parallel and Distributed Systems 16(5):412 - 427, 2005. BibTeX

@article{ 8399916,
	author = "Duato, Jose and Lysne, O. and Pang, R. and Pinkston, T. M.",
	abstract = "This paper develops theoretical support useful for determining deadlock properties of dynamic network reconfiguration techniques and also serves as a basis for the development of design methodologies useful for deriving deadlock-free reconfiguration techniques. It is applicable to interconnection networks typically used in multiprocessor servers, network-based computing clusters, and distributed storage systems, and also has potential application to system-on-chip networks. This theory builds on basic principles established by previous theories while pioneering new concepts fundamental to the case of dynamic network reconfiguration",
	address = "USA",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "multiprocessing systems;multiprocessor interconnection networks;reconfigurable architectures;reliability;system recovery;deadlock-free dynamic network reconfiguration;interconnection network;multiprocessor server;network-based computing cluster;distributed storage system;system-on-chip network;system reliability;system availability;",
	number = 5,
	pages = "412--427",
	title = "{A} theory for deadlock-free dynamic network reconfiguration. {Part I}",
	volume = 16,
	year = 2005
}

I J Nino, B Ossa, J A Gil, Julio Sahuquillo and A Pont. CARENA: a tool to capture and replay Web navigation sessions. In End-to-End Monitoring Techniques and Services, 2005. Workshop on. May 2005, 127 - 141. URL, DOI BibTeX

@inproceedings{ 1564474,
	author = "Nino, I. J. and de la Ossa, B. and Gil, J. A. and Sahuquillo, Julio and Pont, A.",
	abstract = "Web user behavior has widely changed over the last years. To perform precise and up-to-date Web user behavior characterization is important to carry out representative Web performance studies. In this sense, it is valuable to capture detailed information about the user's experience, which permits to perform a fine grain characterization. Two main types of tools are distinguishable: complex commercial software tools like workload generators and academic tools. The latter mainly concentrate on the development of windows applications which gather Web events (e.g., browser events) or tools modifying a part of the web browser rode. In this paper, we present CARENA, a client-side browser-embedded tool to capture and replay user navigation sessions. Like some commercial software packages our tool captures information about the user session, which can be used later to replay or mimic the gathered user navigation. Nevertheless, unlike these software packages, our tool emulates the original user think times since these times are important to obtain precise and reliable performance results. Among the main features of CARENA are: multiplatform, open source, lightweight, standards based, easily installable and usable, programmed in JavaScript and XUL.",
	booktitle = "End-to-End Monitoring Techniques and Services, 2005. Workshop on",
	doi = "10.1109/E2EMON.2005.1564474",
	isbn = "0-7803-9249-3",
	keywords = "Web navigation sessions; client-side browser-embedded tool; user navigation sessions; Internet; online front-ends; software tools;",
	month = may,
	pages = "127--141",
	title = "{CARENA}: a tool to capture and replay {W}eb navigation sessions",
	url = "http://dx.doi.org/10.1109/E2EMON.2005.1564474",
	year = 2005
}

Teresa Nachiondo, Jose Flich, Jose Duato and M Gusat. Cost/performance trade-offs and fairness evaluation of queue mapping policies. In José Cunha; Pedro C D Medeiros (ed.). Euro-Par 2005 Parallel Processing 3648. August 2005, 1024 - 1034. URL, DOI BibTeX

@conference{ 8746125,
	author = "Nachiondo, Teresa and Flich, Jose and Duato, Jose and Gusat, M.",
	abstract = "Whereas the established interconnection networks (ICTN) achieve low latency by operating in the linear region, i.e. oversizing the fabric, the strict cost and power constraints demand more efficient utilization of future networks. Increasing the utilization of lossless ICTNs may, however, lead to saturation and performance degradation owing to HOL-blocking. The current solution to HOL-blocking consists of using virtual output queueing (VOQ), whose quadratical scalability is expensive in large networks. To improve VOQ's scalability we have proposed the destination-based buffer management (DBBM), a scheme that compares well with VOQ. Whereas previously we have analyzed DBBM's basic operation and performance, in this paper we have set two different goals. First we focus on how the different DBBM mappings can impact the cost/performance of multistage ICTNs. Next, because DBBM can introduce unfairness, this constitutes the second theme of our paper. The new results show that DBBM with modulo-4/8 mapping performs very well for only a fraction of the VOQ cost. Also in terms of fairness DBBM shows promise, because it (i) keeps the unfairness degree independent of both topology and routing, while (ii) minimizing the number of flows affected by unfairness",
	booktitle = "Euro-Par 2005 Parallel Processing",
	doi = "10.1007/11549468_112",
	editor = "Cunha, Jos{\'e} C. and Medeiros, Pedro D.",
	isbn = "978-3-540-28700-1",
	journal = "Euro-Par 2005 Parallel Processing. 11th International Euro-Par Conference. Proceedings (Lecture Notes in Computer Science Vol. 3648)",
	keywords = "buffer storage;multistage interconnection networks;performance evaluation;queueing theory;fairness evaluation;queue mapping policies;interconnection networks;destination-based buffer management;multistage ICTN;",
	month = aug,
	pages = "1024--1034",
	series = "Lecture Notes in Computer Science",
	title = "Cost/performance trade-offs and fairness evaluation of queue mapping policies",
	url = "http://dx.doi.org/10.1007/11549468_112",
	volume = 3648,
	year = 2005
}

P J Garcia, Jose Flich, Jose Duato, I Johnson, F J Quiles and F Naven. Dynamic evolution of congestion trees: Analysis and impact on switch architecture. 2005, 266 - 285. BibTeX

@conference{ 2006229908739,
	author = "Garcia, P. J. and Flich, Jose and Duato, Jose and Johnson, I. and Quiles, F. J. and Naven, F.",
	abstract = "Designers of large parallel computers and clusters are becoming increasingly concerned with the cost and power consumption of the interconnection network. A simple way to reduce them consists of reducing the number of network components and increasing their utilization. However, doing so without a suitable congestion management mechanism may lead to dramatic throughput degradation when the network enters saturation. Congestion management strategies for lossy networks (computer networks) are well known, but relatively little effort has been devoted to congestion management in lossless networks (parallel computers, clusters, and on-chip networks). Additionally, congestion is much more difficult to solve in this context due to the formation of congestion trees. In this paper we study the dynamic evolution of congestion trees. We show that, contrary to the common belief, trees do not only grow from the root toward the leaves. There exist cases where trees grow from the leaves to the root, cases where several congestion trees grow independently and later merge, and even cases where some congestion trees completely overlap while being independent. This complex evolution and its implications on switch architecture are analyzed, proposing enhancements to a recently proposed congestion management mechanism and showing the impact on performance of different design decisions. {\copyright} Springer-Verlag Berlin Heidelberg 2005.",
	address = "Barcelona, Spain",
	issn = "0302-9743",
	journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	keywords = "Trees (mathematics);Computer networks;Congestion control (communication);Interconnection networks;Network components;Switching theory;Throughput;Congestion management;Congestion trees;Lossless networks;Throughput degradation;",
	pages = "266--285",
	series = "Lecture Notes in Computer Science",
	title = "Dynamic evolution of congestion trees: {Analysis} and impact on switch architecture",
	volume = 3793,
	year = 2005
}

Teresa Nachiondo, Jose Flich and Jose Duato. Efficient reduction of HOL blocking in multistage networks. In Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International. April 2005, 8 pp. URL, DOI BibTeX

@conference{ 1420115,
	author = "Nachiondo, Teresa and Flich, Jose and Duato, Jose",
	abstract = "Head-of-line blocking is one of the main problems arising in input-buffered switches. The best-known solution to this problem consists of using virtual output queues (VOQs). However this strategy is not scalable. Its implementation cost increases quadratically with the number of ports in the switch. Taking into account current trends, the demand for larger number of ports in high-performance switches is likely to increase very rapidly in the future. Therefore, a scalable and cost-effective solution is required. In this paper we propose an efficient and cost-effective strategy (belonging to a family of strategies previously proposed, referred to as destination-based buffer management (DBBM)), to reduce HOL blocking in single-stage and multistage networks. The proposed strategy is based on allowing certain destinations to share the same queue. Its main purpose is to maximize network throughput whereas keeping HOL blocking to negligible values. In this paper, we apply the strategy at every switch included in a bidirectional multistage network (BMIN). We have evaluated DBBM, VOQ, and alternative strategies in different BMIN sizes and with different traffic conditions (synthetic traffic, IP traces, and I/O traces). Results show that DBBM with a reduced number of queues at each switch obtains roughly the same throughput as the VOQ mechanism. Moreover, VOQ at the switch level (as many queues as output ports at every switch) has also been analyzed. Results demonstrate that it does not scale. As the number of stages in the network increases, the VOQ solution at the switch level introduces more HOL blocking that leads to a severe degradation in network throughput. With the DBBM using 16 queues, maximum network throughput is sustained for all the traffic cases analyzed. Moreover, as the network size increases (up to a 2048 $\times$ 2048 BMIN), DBBM keeps roughly the same performance with the same number of queues.",
	booktitle = "Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International",
	doi = "10.1109/IPDPS.2005.193",
	isbn = "0-7695-2312-9",
	keywords = "bidirectional multistage network; destination-based buffer management; head-of-line blocking; high-performance switch; virtual output queue; multistage interconnection networks; queueing theory; storage management; telecommunication traffic;",
	month = apr,
	pages = "8 pp.",
	title = "Efficient reduction of {HOL} blocking in multistage networks",
	url = "http://dx.doi.org/10.1109/IPDPS.2005.193",
	year = 2005
}

L G Cardenas, J A Gil, Julio Sahuquillo and A Pont. Emulating Web cache replacement algorithms versus a real system. In Computers and Communications, 2005. ISCC 2005. Proceedings. 10th IEEE Symposium on. June 2005, 891 - 897. URL, DOI BibTeX

@conference{ 1493829,
	author = "Cardenas, L. G. and Gil, J. A. and Sahuquillo, Julio and Pont, A.",
	abstract = "This paper presents a powerful framework to simulate Web proxy cache systems. Our tool provides a comfortable environment to simulate and explore cache management techniques. It also includes an extension to design and simulate new structures considering several inter-connected caches which are very convenient for our current research projects. Besides a statistics module is enlarged to obtain supplementary performance measures. We compared the results obtained from our framework against a commercial proxy cache system by using several replacement algorithms and input traces. Experimental results show that proxy cache hit ratio deviations fall very close to the real system, since them never exceeds 3.5%. Although the simulation time varies depending on the input trace size and the modeled management technique, in all experiments run time has been by about several hundred times faster than the time the real system takes.",
	booktitle = "Computers and Communications, 2005. ISCC 2005. Proceedings. 10th IEEE Symposium on",
	doi = "10.1109/ISCC.2005.63",
	isbn = "0-7695-2373-0",
	keywords = "Web cache replacement algorithms; Web proxy cache systems; cache management techniques; statistics module; Internet; cache storage;",
	month = jun,
	pages = "891--897",
	title = "Emulating {Web} cache replacement algorithms versus a real system",
	url = "http://dx.doi.org/10.1109/ISCC.2005.63",
	year = 2005
}

Michihiro Koibuchi, Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Enforcing in-order packet delivery in system area networks with adaptive routing. Journal of Parallel and Distributed Computing 65(10):1223 - 1236, 2005. URL BibTeX

@article{ 2005379355213,
	author = "Koibuchi, Michihiro and Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "Adaptive routing, which dynamically selects the route of packets, has been widely studied for interconnection networks in massively parallel computers and system area networks. Although adaptive routing has the advantage of providing high bandwidth, it may deliver packets out-of-order, which some message passing libraries do not accept. In this paper, we propose two mechanisms called (1) FIFO transmission and (2) couple limitation to guarantee in-order packet delivery in adaptive routing. Both of them limit packet injection at source hosts. The FIFO transmission completely avoids packet sorting at destination hosts, while the couple limitation uses a few buffers to sort packets at destination hosts. Evaluation results show that the FIFO transmission and the couple limitation achieve a similar throughput to that of a method equipped with huge (infinite) buffers enough to store all out-of-order packets at destination hosts under both synthetic traffic and NAS Parallel Benchmarks. {\copyright} 2005 Elsevier Inc. All rights reserved.",
	doi = "10.1016/j.jpdc.2005.04.007",
	issn = "0743-7315",
	journal = "Journal of Parallel and Distributed Computing",
	keywords = "Packet networks;Bandwidth;Benchmarking;Interconnection networks;Routers;Telecommunication traffic;Adaptive routing;In-order packet delivery;PC clusters;System area networks;",
	number = 10,
	pages = "1223--1236",
	title = "Enforcing in-order packet delivery in system area networks with adaptive routing",
	url = "http://dx.doi.org/10.1016/j.jpdc.2005.04.007",
	volume = 65,
	year = 2005
}

Pedro Morillo, Juan M Orduna, Marcos Fernandez and Jose Duato. Improving the performance of distributed virtual environment systems. IEEE Transactions on Parallel and Distributed Systems 16(7):637 - 649, 2005. URL BibTeX

@article{ 2005329281291,
	author = "Morillo, Pedro and Orduna, Juan M. and Fernandez, Marcos and Duato, Jose",
	abstract = "The last years have witnessed a dramatic growth in the number as well as in the variety of distributed virtual environment systems. These systems allow multiple users, working on different client computers that are interconnected through different networks, to interact in a shared virtual world. One of the key issues in the design of scalable and cost-effective DVE systems is the partitioning problem. This problem consists of efficiently assigning the existing clients to the servers in the system and some techniques have been already proposed for solving it. This paper experimentally analyzes the correlation of the quality function proposed in the literature for solving the partitioning problem with the performance of DVE systems. Since the results show an absence of correlation, we also propose the experimental characterization of DVE systems. The results show that the reason for that absence of correlation is the nonlinear behavior of DVE systems with regard to the number of clients in the system. DVE systems reach saturation when any of the servers reaches 100 percent of CPU utilization. The system performance greatly decreases if this limit is exceeded in any server. Also, as a direct application of these results, we present a partitioning method that is targeted to keep all the servers in the system below a certain threshold value of CPU utilization, regardless of the amount of network traffic. Evaluation results show that the proposed partitioning method can improve DVE system performance, regardless of both the movement pattern of clients and the initial distribution of clients in the virtual world. {\copyright} 2005 IEEE.",
	doi = "10.1109/TPDS.2005.83",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Distributed computer systems;Computer simulation;Correlation methods;Evaluation;Performance;Servers;Telecommunication traffic;Virtual reality;Distributed applications;Distributed network graphics;",
	number = 7,
	pages = "637--649",
	title = "Improving the performance of distributed virtual environment systems",
	url = "http://dx.doi.org/10.1109/TPDS.2005.83",
	volume = 16,
	year = 2005
}

Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez, Jose Duato and M Koibuchi. In-Order Packet Delivery in Interconnection Networks using Adaptive Routing. In Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International. 2005, 101 - 101. DOI BibTeX

@conference{ 1419928,
	author = "Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose and Koibuchi, M.",
	abstract = "Most commercial switch-based network technologies for PC clusters use deterministic routing. Alternatively, adaptive routing could be used to improve network performance. In this case, switches decide the path to reach the destination by using local information about the state of the possible outgoing links. However, there are two drawbacks that discourage adaptive routing from being applied to commercial interconnects. The first one concerns the possible switch complexity increase with respect to deterministic routing. The second drawback is due to the fact that adaptive routing may introduce out-of-order packet delivery, which is not acceptable for some applications. For the best of our knowledge, there are no works that analyze the degree of out-of-order packet delivery caused by different network and traffic conditions. In this paper, we take on such a challenge. We show that only for high traffic conditions (reaching saturation) out-of-order delivery is introduced. Moreover, by using small buffers and simple sorting mechanisms at destination, we show that high network throughput can be obtained at the same time packets are delivered in order. Thus, the paper demonstrates that it is possible to use adaptive routing, while still guaranteeing in-order packet delivery, without using large buffer resources nor degrading significantly its performance.",
	booktitle = "Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International",
	doi = "10.1109/IPDPS.2005.255",
	keywords = "PC clusters; adaptive routing; deterministic routing; interconnection networks; out-of-order packet delivery; sorting mechanisms; switch-based network technologies; multiprocessor interconnection networks; network routing; packet switching; sorting; work",
	month = apr,
	pages = "101",
	title = "In-Order Packet Delivery in Interconnection Networks using Adaptive Routing",
	year = 2005
}

Wu-Chun Feng and Jose Duato. Message from the program co-chairs. Proceedings of the International Conference on Parallel Processing 2005:xii - xii, 2005. URL BibTeX

@article{ 2006259954343,
	author = "Feng, Wu-Chun and Duato, Jose",
	abstract = "No abstract available",
	address = "Oslo, Norway",
	doi = "10.1109/ICPP.2005.52",
	issn = "0190-3918",
	journal = "Proceedings of the International Conference on Parallel Processing",
	pages = "xii",
	title = "Message from the program co-chairs",
	url = "http://dx.doi.org/10.1109/ICPP.2005.52",
	volume = 2005,
	year = 2005
}

P J Garcia, Jose Flich, Jose Duato, F J Quiles, I Johnson and F Naven. On the correct sizing on meshes through an effective congestion management strategy. 2005, 1035 - 1045. BibTeX

@conference{ 8746126,
	author = "Garcia, P. J. and Flich, Jose and Duato, Jose and Quiles, F. J. and Johnson, I. and Naven, F.",
	abstract = "Interconnection networks used in clusters of PCs are often dimensioned with certain restrictions. One restriction could be the reduction of power consumption and overall cost. In this sense, the network size must be reduced. Another restriction is to guarantee that the system offers a minimum bandwidth. In this case, the network size must be increased. In both cases, the head-of-line (HOL) blocking effect (related to network congestion) may appear, degrading network performance and thus, preventing the correct sizing of the network. Therefore, some mechanisms should be implemented for reducing or eliminating this problem, in order to dimension the network as desired while keeping network performance at maximum. In this paper we analyze the impact on network performance when using different mechanisms for handling HOL blocking when interconnection networks with mesh topology are dimensioned in several ways. We show that the previously proposed RECN congestion control mechanism is key in order to efficiently eliminate HOL blocking in meshes and, therefore, it allows the correct network sizing",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2005 Parallel Processing",
	journal = "Euro-Par 2005 Parallel Processing. 11th International Euro-Par Conference. Proceedings (Lecture Notes in Computer Science Vol. 3648)",
	keywords = "computer network management;multiprocessor interconnection networks;performance evaluation;telecommunication congestion control;mesh network sizing;congestion management;interconnection networks;head-of-line blocking reduction;HOL blocking handling;RECN congestion control;",
	pages = "1035--1045",
	series = "Lecture Notes in Computer Science",
	title = "On the correct sizing on meshes through an effective congestion management strategy",
	volume = 3648,
	year = 2005
}

José Cano Reyes, J-C Cano, P Manzoni and D Ferrandez. On the design of spontaneous networks using a P2P approach and Bluetooth. In Computers and Communications, 2005. ISCC 2005. Proceedings. 10th IEEE Symposium on. June 2005, 125 - 130. URL, DOI BibTeX

@conference{ 1493717,
	author = "Cano Reyes, Jos{\'e} and Cano, J.-C. and Manzoni, P. and Ferrandez, D.",
	abstract = "In this paper, we address the design of spontaneous networks using a peer to peer (P2P) and Bluetooth technology. A spontaneous network is a small infrastructureless network formed when a group of people come together to participate in some collaborative activity. In this work, we develop a base library and application work ground for easy spontaneous networks development. Based on it, we present an experimental application that provides spontaneous networks with context and transparent services to interchange resources between peers. We describe the overall network architecture and present details of the implementation steps taken to create our P2P and Bluetooth based application. Finally we run some experiments in a small testbed to evaluate the performance and system behaviour. We present our findings in term of the duration of the inquiry procedure and throughput performance with respect to distance and node speed.",
	booktitle = "Computers and Communications, 2005. ISCC 2005. Proceedings. 10th IEEE Symposium on",
	doi = "10.1109/ISCC.2005.110",
	issn = "1530-1346",
	keywords = "Bluetooth; P2P approach; peer to peer technology; spontaneous network; peer-to-peer computing;",
	month = jun,
	pages = "125--130",
	title = "On the design of spontaneous networks using a {P2P} approach and {Bluetooth}",
	url = "http://dx.doi.org/10.1109/ISCC.2005.110",
	year = 2005
}

Jose Duato, Olav Lysne, Ruoming Pang and Timothy M Pinkston. Part I: A theory for deadlock-free dynamic network reconfiguration. IEEE Transactions on Parallel and Distributed Systems 16(5):412 - 427, 2005. URL BibTeX

@article{ 2005259162434,
	author = "Duato, Jose and Lysne, Olav and Pang, Ruoming and Pinkston, Timothy M.",
	abstract = "This paper develops theoretical support useful for determining deadlock properties of dynamic network reconfiguration techniques and also serves as a basis for the development of design methodologies useful for deriving deadlock-free reconfiguration techniques. It is applicable to interconnection networks typically used in multiprocessor servers, network-based computing clusters, and distributed storage systems, and also has potential application to system-on-chip networks. This theory builds on basic principles established by previous theories while pioneering new concepts fundamental to the case of dynamic network reconfiguration. {\copyright} 2005 IEEE.",
	doi = "10.1109/TPDS.2005.58",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Interconnection networks;Communication channels (information theory);Computer system recovery;Data storage equipment;Servers;Theorem proving;Deadlock freedom theory;Dynamic reconfiguration;",
	number = 5,
	pages = "412--427",
	title = "Part {I}: {A} theory for deadlock-free dynamic network reconfiguration",
	url = "http://dx.doi.org/10.1109/TPDS.2005.58",
	volume = 16,
	year = 2005
}

Olav Lysne, Timothy Mark Pinkston and Jose Duato. Part II: A methodology for developing deadlock-free dynamic network reconfiguration processes. IEEE Transactions on Parallel and Distributed Systems 16(5):428 - 443, 2005. URL BibTeX

@article{ 2005259162435,
	author = "Lysne, Olav and Pinkston, Timothy Mark and Duato, Jose",
	abstract = "Dynamic network reconfiguration is defined as the process of changing from routing function to another while the network remains up and running. The main challenge is in avoiding deadlock anomalies while keeping restrictions on packet injection and forwarding minimal. Current approaches either require virtual channels in the network or they work only for a limited set of routing algorithms and/or fault patterns. In this paper, we present a methodology for devising deadlock free and dynamic transitions between old and new routing functions that is consistent with newly proposed theory [1]. The methodology is independent of topology, can be applied to any deadlock-free routing function, and puts no restrictions on the routing function changes that can be supported. Furthermore, it does not require any virtual channels to guarantee deadlock freedom. This research is motivated by current trends toward using increasingly larger Internet and transaction processing servers based on clusters of PCs that have very high availability and dependability requirements, as well as other local, system, and storage area network-based computing systems. {\copyright} 2005 IEEE.",
	doi = "10.1109/TPDS.2005.59",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Interconnection networks;Algorithms;Communication channels (information theory);Computer system recovery;Computer systems;Internet;Routers;Servers;Deadlock freedom methodology;Dynamic reconfiguration;",
	number = 5,
	pages = "428--443",
	title = "Part {II}: {A} methodology for developing deadlock-free dynamic network reconfiguration processes",
	url = "http://dx.doi.org/10.1109/TPDS.2005.59",
	volume = 16,
	year = 2005
}

L G Cardenas, J A Gil, J Domenech, Julio Sahuquillo and A Pont. Performance comparison of a Web cache simulation framework. In Advanced Information Networking and Applications, 2005. AINA 2005. 19th International Conference on, vol. 2. March 2005, 281 - 284. URL, DOI BibTeX

@conference{ 1423694,
	author = "Cardenas, L. G. and Gil, J. A. and Domenech, J. and Sahuquillo, Julio and Pont, A.",
	abstract = "Performance comparison studies are primarily carried out through real systems or simulation environments. Simulation is the most commonly used method to explore new proposals due to both its flexibility and the relatively reduced time taken to obtain performance results. This paper presents a powerful framework to simulate Web proxy cache systems. Our tool provides a comfortable environment to simulate and explore cache management techniques. In order to validate our framework and show how accurate it executes, a performance comparison has been done. We analyzed the details of a commercial proxy cache system and compare its results with those obtained from our simulator using the most commonly replacement algorithm (LRU). For this purpose, the proposed environment was adapted to match the performance of the real proxy cache. Experimental results show that proxy cache hit ratio deviations fall very close to the real system, since then, never exceeds 3.42%.",
	booktitle = "Advanced Information Networking and Applications, 2005. AINA 2005. 19th International Conference on",
	doi = "10.1109/AINA.2005.275",
	isbn = "0-7695-2249-1",
	keywords = "LRU; Web cache simulation; Web proxy cache system; cache management; performance comparison; real system; replacement algorithm; simulation environment; simulation techniques; Internet; cache storage; digital simulation;",
	month = mar,
	pages = "281--284",
	title = "Performance comparison of a {Web} cache simulation framework",
	url = "http://dx.doi.org/10.1109/AINA.2005.275",
	volume = 2,
	year = 2005
}

Marina Alonso, Juan Miguel Martínez, Vicente Santonja, Pedro Lopez and Jose Duato. Power Saving in Regular Interconnection Networks Built with High-Degree Switches. In Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International. April 2005, 5b - 5b. URL, DOI BibTeX

@conference{ 1419820,
	author = "Alonso, Marina and Mart{\'i}nez, Juan Miguel and Santonja, Vicente and Lopez, Pedro and Duato, Jose",
	abstract = "Nowadays, high-degree switches are available as building blocks of the interconnection network of clusters of PCs. An alternative to take advantage of the high number of switch ports is to connect every pair of switches through not only one but several links (this is known as link trunking in other environments). This extra connectivity can be exploited by using adaptive routing algorithms, thus improving network throughput and reducing network congestion. However with low traffic loads, all the links that compose the trunk link will not be utilized, but this idle links continue consuming power. Power consumption reduction techniques are being applied everywhere in computer systems and the interconnection network is not an exception, as its contribution is not negligible. In this paper, we present a mechanism that dynamically switches on and off network links as a function of traffic. It is specially targeted to those networks where trunk links are used. The mechanism can switch off any link, provided that network connectivity is guaranteed, (i.e. every pair of switches should be connected through at least one active link). Indeed, this restriction makes possible to use the same routing algorithm regardless the power saving actions taken, thus simplifying router design. Our simulation results show that the network power consumption can be greatly reduced, at the expense of some increase in latency. Nevertheless, it is shown that the power reduction is always higher that this latency increase.",
	booktitle = "Parallel and Distributed Processing Symposium, 2005. Proceedings. 19th IEEE International",
	doi = "10.1109/IPDPS.2005.349",
	isbn = "0-7695-2312-9",
	keywords = "PC clusters; adaptive routing algorithm; high-degree switch; link trunking; network congestion; network link; network throughput; power consumption; power saving; regular interconnection network; telecommunication traffic; power consumption; telecommunic",
	month = apr,
	pages = "5b",
	title = "Power Saving in Regular Interconnection Networks Built with High-Degree Switches",
	url = "http://dx.doi.org/10.1109/IPDPS.2005.349",
	year = 2005
}

Marina Alonso, Juan Miguel Martínez, Vicente Santonja, Pedro Lopez and Jose Duato. Power saving in regular interconnection networks built with high-degree switches. 2005, 10 pp. BibTeX

@conference{ 8539357,
	author = "Alonso, Marina and Mart{\'i}nez, Juan Miguel and Santonja, Vicente and Lopez, Pedro and Duato, Jose",
	abstract = "Nowadays, high-degree switches are available as building blocks of the interconnection network of clusters of PCs. An alternative to take advantage of the high number of switch ports is to connect every pair of switches through not only one but also several links (this is known as link trunking in other environments). This extra connectivity can be exploited by using adaptive routing algorithms, thus improving network throughput and reducing network congestion. However with low traffic loads, all the links that compose the trunk link will not be utilized, but this idle links continue consuming power. Power consumption reduction techniques are being applied everywhere in computer systems and the interconnection network is not an exception, as its contribution is not negligible. In this paper, we present a mechanism that dynamically switches on and off network links as a function of traffic. It is specially targeted to those networks where trunk links are used. The mechanism can switch off any link, provided that network connectivity is guaranteed, (i.e. every pair of switches should be connected through at least one active link). Indeed, this restriction makes possible to use the same routing algorithm regardless the power saving actions taken, thus simplifying router design. Our simulation results show that the network power consumption can be greatly reduced, at the expense of some increase in latency. Nevertheless, it is shown that the power reduction is always higher that this latency increases",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 19th IEEE International Parallel and Distributed Processing Symposium",
	keywords = "power consumption;telecommunication congestion control;telecommunication links;telecommunication network routing;telecommunication switching;telecommunication traffic;workstation clusters;power saving;regular interconnection network;high-degree switch;PC clusters;link trunking;adaptive routing algorithm;network throughput;network congestion;network link;",
	pages = "10 pp.",
	title = "Power saving in regular interconnection networks built with high-degree switches",
	year = 2005
}

Wu-Chun Feng and Jose Duato (eds.). Proceedings. 2005 International Conference on Parallel Processing. 2005, xvii+649. BibTeX

@proceedings{ 8598848,
	abstract = "The following topics are dealt with: processor scheduling; resource allocation; overlay networks; processor architecture; program compilers; on-chip parallelism; message passing; virtual network; optical network; shared memory computing; peer-to-peer computing; ad-hoc network; network performance; grid computing; mobile computing; interconnection networks; cross-node clustering",
	address = "Los Alamitos, CA, USA",
	editor = "Feng, W.-c. and Duato, J.",
	keywords = "ad hoc networks;grid computing;message passing;mobile computing;multiprocessor interconnection networks;optical fibre networks;parallel processing;processor scheduling;resource allocation;shared memory systems;system-on-chip;workstation clusters;",
	note = "processor scheduling;resource allocation;overlay networks;processor architecture;program compilers;on-chip parallelism;message passing;virtual network;optical network;shared memory computing;peer-to-peer computing;ad-hoc network;network performance;grid computing;mobile computing;interconnection networks;cross-node clustering;",
	pages = "xvii+649",
	title = "{P}roceedings. 2005 {I}nternational {C}onference on {P}arallel {P}rocessing",
	year = 2005
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Providing full QoS support in clusters using only two VCs at the switches. 2005, 158 - 169. BibTeX

@conference{ 2006219900053,
	author = "Martinez, A. and Alfaro, F. J. and Sanchez, J. L. and Duato, Jose",
	abstract = "Current interconnect standards providing hardware support for quality of service (QoS) consider up to 16 virtual channels (VCs) for this purpose. However, most implementations do not offer so many VCs because they increase the complexity of the switch and the scheduling delays. In this paper, we show that this number of VCs can be significantly reduced. Some of the scheduling decisions made at network interfaces can be easily reused at switches without significantly altering the global behavior. Specifically, we show that it is enough to use two VCs for QoS purposes at each switch port, thereby simplifying the design and reducing its cost. \copyright{} Springer-Verlag Berlin Heidelberg 2005.",
	address = "Goa, India",
	booktitle = "High Performance Computing -- HiPC 2005. 12th International Conference. Proceedings",
	issn = "0302-9743",
	key = "Optical interconnects",
	keywords = "Computational complexity;Interfaces (computer);Quality of service;Scheduling;Standards;Virtual reality;",
	note = "de-lays;Hardware support;Virtual channels (VCs);",
	pages = "158--169",
	series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
	title = "{P}roviding full {Q}o{S} support in clusters using only two {VC}s at the switches",
	volume = 3769,
	year = 2005
}

A Martinez, F J Alfaro, J L Sanchez and Jose Duato. Providing full QoS support in clusters using only two VCs at the switches. 2005, 158 - 69. BibTeX

@conference{ 8815927,
	author = "Martinez, A. and Alfaro, F. J. and Sanchez, J. L. and Duato, Jose",
	abstract = "Current interconnect standards providing hardware support for quality of service (QoS) consider up to 16 virtual channels (VCs) for this purpose. However, most implementations do not offer so many VCs because they increase the complexity of the switch and the scheduling delays. In this paper, we show that this number of VCs can be significantly reduced. Some of the scheduling decisions made at network interfaces can be easily reused at switches without significantly altering the global behavior. Specifically, we show that it is enough to use two VCs for QoS purposes at each switch port, thereby simplifying the design and reducing its cost",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing -- HiPC 2005. 12th International Conference. Proceedings",
	keywords = "network interfaces;processor scheduling;quality of service;workstation clusters;",
	note = "full QoS support;workstation clusters;quality of service;virtual channels;scheduling decision reuse;network interfaces;",
	pages = "158--169",
	series = "Lecture Notes in Computer Science",
	title = "{P}roviding full {Q}o{S} support in clusters using only two {VC}s at the switches",
	volume = 3769,
	year = 2005
}

R Martinez, J L Sanchez, F J Alfaro, Vicente Chirivella and Jose Flich. Studying the effect of the design parameters on the interconnection network performance in NOWs. In Parallel, Distributed and Network-Based Processing, 2005. PDP 2005. 13th Euromicro Conference on. February 2005, 102 - 109. URL, DOI BibTeX

@conference{ 1386048,
	author = "Martinez, R. and Sanchez, J. L. and Alfaro, F. J. and Chirivella, Vicente and Flich, Jose",
	abstract = "With the increasing use of network of workstations (NOWs) as an alternative to huge parallel computers it has become essential to design high-performance interconnection networks for the communication between the nodes of these clusters. A large number of studies have been carried out to achieve this objective. Most of them propose a new technique that affects one of the parameters that characterize the interconnection network. These techniques are completely new or inspired in the techniques previously used in multiprocessor systems. The impact of the proposal is studied (in most cases using simulation), and an analysis is made of the effect of the new technique over the system performance versus those currently in existence. In this kind of study most of the network parameters are fixed and usually only a few parameters are varied. This paper presents a more general study of the interconnection network performance. This study consists in showing the effect of different design parameters over the network performance, and the interaction between them. This study would not be viable with the traditional techniques due to the number of simulations required. The alternative of the experimental design is used to carry out the study.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2005. PDP 2005. 13th Euromicro Conference on",
	doi = "10.1109/EMPDP.2005.40",
	isbn = "0-7695-2280-7",
	issn = "1066-6192",
	keywords = "interconnection network performance; multiprocessor systems; network of workstations; network parameters; multiprocessor interconnection networks; performance evaluation; workstation clusters;",
	month = feb,
	organization = "IEEE",
	pages = "102--109",
	title = "{S}tudying the effect of the design parameters on the interconnection network performance in {NOW}s",
	url = "http://dx.doi.org/10.1109/EMPDP.2005.40",
	year = 2005
}

Francisco J Alfaro, Jose L Sanchez and Jose Duato. Studying the influence of the InfiniBand packet size to guarantee QoS. 2005, 989 - 994. BibTeX

@conference{ 2005459464579,
	author = "Alfaro, Francisco J. and Sanchez, Jose L. and Duato, Jose",
	abstract = "InfiniBand (IBA) has been proposed as an industry-standard architecture both for I/O server and interprocessor communication. IBA employs a switched point-to-point network, instead of using a shared bus. IBA is being developed by the InfiniBand\textsuperscript{SM} Trade Association to provide present and future server systems with the required levels of reliability, availability, performance, scalability, and quality of service (QoS). In previous papers we have proposed an effective strategy for configuring the IBA networks to provide users with the required levels of QoS. This strategy is based on the proper configuration of the mechanisms IBA carries to support QoS. Specifically, our methodology configures the InfiniBand Arbitration Tables and uses the different Service Levels and Virtual Lanes that are available, in order to segregate the different traffic flows. Thus, each flow receives the treatment it has previously requested. Moreover, by using our methodology, applications can be assured that their requirements will be satisfied. In this paper, we review the basis of our methodology and we study the influence of the packet size on the QoS guaranteed to the applications. \copyright{} 2005 IEEE.",
	address = "Murcia, Spain",
	booktitle = "Proceedings - IEEE Symposium on Computers and Communications",
	issn = "1530-1346",
	key = "Servers",
	keywords = "Computer architecture;Program processors;Quality of service;",
	note = "Interprocessor communication;Point-to-point network;Point-to-point networks;",
	pages = "989--994",
	title = "{S}tudying the influence of the {I}nfini{B}and packet size to guarantee {Q}o{S}",
	year = 2005
}

F J Alfaro, J L Sanchez and Jose Duato. Studying the influence of the InfiniBand packet size to guarantee QoS. 2005, 989 - 994. BibTeX

@conference{ 8642751,
	author = "Alfaro, F. J. and Sanchez, J. L. and Duato, Jose",
	abstract = "InfiniBand (IBA) has been proposed as an industry-standard architecture both for I/O server and interprocessor communication. IBA employs a switched point-to-point network, instead of using a shared bus. IBA is being developed by the InfiniBand\textsuperscript{SM} Trade Association to provide present and future server systems with the required levels of reliability, availability, performance, scalability, and quality of service (QoS). In previous papers we have proposed an effective strategy for configuring the IBA networks to provide users with the required levels of QoS. This strategy is based on the proper configuration of the mechanisms IBA carries to support QoS. Specifically, our methodology configures the InfiniBand arbitration tables and uses the different service levels and virtual lanes that are available, in order to segregate the different traffic flows. Thus, each flow receives the treatment it has previously requested. Moreover, by using our methodology, applications can be assured that their requirements will be satisfied. In this paper, we review the basis of our methodology and we study the influence of the packet size on the QoS guaranteed to the applications",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 10th IEEE Symposium on Computers and Communications",
	keywords = "network servers;packet switching;quality of service;telecommunication traffic;",
	note = "InfiniBand packet size;QoS;industry-standard architecture;I/O server;interprocessor communication;switched point-to-point network;server systems;quality of service;InfiniBand arbitration tables;service levels;virtual lanes;traffic flows;",
	pages = "989--994",
	title = "{S}tudying the influence of the {I}nfini{B}and packet size to guarantee {Q}o{S}",
	year = 2005
}

Blanca Caminero, Carmen Carrion, Francisco J Quiles, Jose Duato and Sudhakar Yalamanchili. Traffic scheduling solutions with QoS support for an input-buffered multimedia router. IEEE Transactions on Parallel and Distributed Systems 16(11):1009 - 1021, 2005. URL BibTeX

@article{ 2005499529197,
	author = "Caminero, Blanca and Carrion, Carmen and Quiles, Francisco J. and Duato, Jose and Yalamanchili, Sudhakar",
	abstract = "Quality of Service (QoS) support in local and cluster area environments has become an issue of great interest in recent years. Most current high-performance interconnection solutions for these environments have been designed to enhance conventional best-effort traffic performance, but are not well-suited to the special requirements of the new multimedia applications. The MultiMedia Router (MMR) aims at offering hardware-based QoS support within a compact interconnection component. One of the key elements in the MMR architecture are the algorithms used in traffic scheduling. These algorithms are responsible for the order in which information is forwarded through the internal switch. Thus, they are closely related to the QoS-provisioning mechanisms. In this paper, several traffic scheduling algorithms developed for the MMR architecture are described. Their general organization is motivated by chances for parallelization and pipelining, while providing the necessary support both to multimedia flows and to best-effort traffic. Performance evaluation results show that the QoS requirements of different connections are met, in spite of the presence of best-effort traffic, while achieving high link utilizations. \copyright{} 2005 IEEE.",
	doi = "10.1109/TPDS.2005.140",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Data communication systems",
	keywords = "Local area networks;Multimedia systems;Parallel processing systems;Pipeline processing systems;Quality of service;Routers;Switching networks;Telecommunication traffic;",
	note = "Cluster networks;Input-buffered multimedia router;Link scheduling;Switch architecture;Switch scheduling;",
	number = 11,
	pages = "1009--1021",
	title = "{T}raffic scheduling solutions with {Q}o{S} support for an input-buffered multimedia router",
	url = "http://dx.doi.org/10.1109/TPDS.2005.140",
	volume = 16,
	year = 2005
}

P Morillo, J M Orduna, M Fernandez and Jose Duato. A comparison study of metaheuristic techniques for providing QoS to avatars in DVE systems. 2004, 661 - 70. BibTeX

@conference{ 8179516,
	author = "Morillo, P. and Orduna, J. M. and Fernandez, M. and Duato, Jose",
	abstract = "Network-server architecture has become a de-facto standard for distributed virtual environment (DVE) systems. In these systems, a large set of remote users share a 3D virtual scene. In order to design scalable DVE systems, different approaches have been proposed to maintain the DVE system working under its saturation point, maximizing system throughput. Also, in order to provide quality of service to avatars in a DVE system, avatars should be assigned to servers taking into account, among other factors, system throughput and system latency. This highly complex problem is called quality of service (QoS) problem in DVE systems. This paper proposes two different approaches for solving the QoS problem, based on modern heuristics (simulated annealing and GRASP). Performance evaluation results show that the proposed strategies are able not only to provide quality of service to avatars in a DVE system, but also to keep the system away from the saturation point",
	address = "Berlin, Germany",
	booktitle = "Computational Science and Its Applications -- ICCSA 2004. International Conference. Proceedings, Part II",
	keywords = "client-server systems;quality of service;simulated annealing;virtual reality;",
	note = "QoS;avatars;network-server architecture;distributed virtual environment;simulated annealing;GRASP;quality of service;",
	pages = "661--670",
	series = "Lecture Notes in Computer Science",
	title = "{A} comparison study of metaheuristic techniques for providing {Q}o{S} to avatars in {DVE} systems",
	volume = 3044,
	year = 2004
}

Jose Duato, Jose Flich and Teresa Nachiondo. A cost-effective technique to reduce HOL blocking in single-stage and multistage switch fabrics. In Parallel, Distributed and Network-Based Processing, 2004. Proceedings. 12th Euromicro Conference on. February 2004, 48 - 53. URL, DOI BibTeX

@conference{ 1271426,
	author = "Duato, Jose and Flich, Jose and Nachiondo, Teresa",
	abstract = "Head-of-line (HOL) blocking is one of the main problems arising in input-buffered switches. The best-known solution to this problem consists of using virtual output queues (VOQs). However this strategy is not scalable at all. Its implementation cost increases quadratically with the number of ports in the switch. Taking into account current trends, the demand for larger number of ports in high-performance switches is likely to increase very rapidly in the near future. Therefore, a more scalable and cost-effective solution is required. We propose a very efficient and cost-effective technique, referred to as destination-based buffer management (DBBM), to reduce HOL blocking in single-stage and multistage switch. Results show that the use of the DBBM technique with a reduced number of queues at each IA is able to obtain roughly the same throughput as the VOQ mechanism. In particular, the number of queues can be reduced by a factor of up to 8 with the DBBM technique.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2004. Proceedings. 12th Euromicro Conference on",
	doi = "10.1109/EMPDP.2004.1271426",
	isbn = "0-7695-2083-9",
	issn = "1066-6192",
	keywords = "cost-effective technique; destination-based buffer management; head-of-line blocking; input-buffered switches; multistage switch fabrics; single-stage switch fabrics; virtual output queues; IP networks; buffer storage; packet switching; queueing theory;",
	month = feb,
	pages = "48--53",
	title = "{A} cost-effective technique to reduce {HOL} blocking in single-stage and multistage switch fabrics",
	url = "http://dx.doi.org/10.1109/EMPDP.2004.1271426",
	year = 2004
}

P Morillo, J M Orduna, M Fernandez and Jose Duato. A fine-grain method for solving the partitioning problem in distributed virtual environment systems. 2004, 292 - 297. BibTeX

@conference{ 2005048802401,
	author = "Morillo, P. and Orduna, J. M. and Fernandez, M. and Duato, Jose",
	abstract = "Distributed Virtual Environment (DVE) systems have experienced a spectacular growth last years. The partitioning problem has been proven as the most critical issue in order to design scalable and efficient DVE systems. It consists of efficiently assigning clients (3-D avatars) to the servers in the system, and some methods have been proposed for solving it. However, only two of these methods take into account the non-linear behavior of DVE servers with the number of avatars attached to them. In this paper, we propose a fine-grain load balancing technique for solving the partitioning problem in DVE systems. Unlike a previously proposed technique, this proposal takes into account the estimated state of the target server before re-assigning avatars. The exceeding workload that causes the saturation of a given server is proportionally distributed among several servers, if necessary. This method avoids the cascading effect, and it allows to increase system throughput with few re-assignments of avatars. Evaluation results show that the proposed method can improve DVE system performance, regardless of both the movement pattern and also the initial distribution of avatars in the virtual world.",
	address = "Cambridge, MA, United States",
	booktitle = "Proceedings of the IASTED International Conference on Parallel and Distributed Computing and Systems",
	issn = "1027-2658",
	key = "Distributed computer systems",
	keywords = "Cascade connections;Client server computer systems;Computer simulation;Internet;Network protocols;Problem solving;Virtual reality;",
	note = "Cascading effect;Distributed virtual environments (DVE);Inter-server communications;Load balancing;",
	pages = "292--297",
	title = "{A} fine-grain method for solving the partitioning problem in distributed virtual environment systems",
	volume = 16,
	year = 2004
}

N A Nordbotten, Maria E Gomez, Jose Flich, Pedro Lopez, Antonio Robles, T Skeie, O Lysne and Jose Duato. A fully adaptive fault-tolerant routing methodology based on intermediate nodes. 2004, 341 - 56. BibTeX

@conference{ 8322959,
	author = "Nordbotten, N. A. and Gomez, Maria E. and Flich, Jose and Lopez, Pedro and Robles, Antonio and Skeie, T. and Lysne, O. and Duato, Jose",
	abstract = "Massively parallel computing systems are being built with thousands of nodes. Because of the high number of components, it is critical to keep these systems running even in the presence of failures. Interconnection networks play a key-role in these systems, and this paper proposes a fault-tolerant routing methodology for use in such networks. The methodology supports any minimal routing function (including fully adaptive routing), does not degrade performance in the absence of faults, does not disable any healthy node, and is easy to implement both in meshes and tori. In order to avoid network failures, the methodology uses a simple mechanism: for some source-destination pairs, packets are forwarded to the destination node through a set of intermediate nodes (without being ejected from the network). The methodology is shown to tolerate a large number of faults (e.g., five/nine faults when using two/three intermediate nodes in a 3D torus). Furthermore, the methodology offers a gracious performance degradation: in an $8 \times 8 \times 8$ torus network with 14 faults the throughput is only decreased by 6.49\%",
	address = "Berlin, Germany",
	booktitle = "Network and Parallel Computing. IFIP International Conference, NPC 2004. Proceedings",
	keywords = "fault tolerant computing;multiprocessor interconnection networks;packet switching;parallel processing;telecommunication network routing;",
	note = "fully adaptive fault-tolerant routing;intermediate nodes;massively parallel computing systems;interconnection networks;minimal routing function;network failures;source-destination pairs;",
	pages = "341--356",
	series = "Lecture Notes in Computer Science",
	title = "{A} fully adaptive fault-tolerant routing methodology based on intermediate nodes",
	volume = 3222,
	year = 2004
}

JE Villalobos, JL Sanchez, JA Gamez, JC Sancho and Antonio Robles. A methodology to evaluate the effectiveness of traffic balancing algorithms. In M Danelutto, D Laforenza and M Vanneschi (eds.). EURO-PAR 2004 PARALLEL PROCESSING, PROCEEDINGS 3149. 2004, 891-899. BibTeX

@conference{ isi:000223792500118,
	author = "Villalobos, J. E. and Sanchez, J. L. and Gamez, J. A. and Sancho, J. C. and Robles, Antonio",
	abstract = "Traffic balancing algorithms represent a cost-effective alternative to balance traffic in high performance interconnection networks. The importance of these algorithms is increasing since most of the current network technologies for clusters are either based on source routing or use deterministic routing. In source-routed networks, the host is responsible for selecting the suitable path among the set of paths provided by the routing algorithm. The selection of an optimal path that maximizes the channel utilization is not trivial because of the huge amount of combinations. Traffic balancing algorithms are based on heuristics in order to find an optimal solution. In this paper, we propose a new methodology based on the use of metaheuristic algorithms to evaluate the effectiveness of traffic balancing algorithms. Preliminary results show that the set of paths provided by current traffic balancing algorithms are still far from an optimized solution. Thus, it is worth continuing to design more efficient traffic balancing algorithms.",
	booktitle = "Euro-Par 2004 Parallel Processing, Proceedings",
	editor = "Danelutto, M. and Laforenza, D. and Vanneschi, M.",
	isbn = "3-540-22924-8",
	issn = "0302-9743",
	note = "10th International Euro-Par Conference on Parallel Processing, Pisa, Italy, 2004",
	pages = "891--899",
	series = "Lecture Notes in Computer Science",
	title = "{A} methodology to evaluate the effectiveness of traffic balancing algorithms",
	volume = 3149,
	year = 2004
}

Maria E Gomez, Jose Duato, Jose Flich, Pedro Lopez, Antonio Robles, N A Nordbotten, T Skeie and O Lysne. A new adaptive fault-tolerant routing methodology for direct networks. 2004, 462 - 73. BibTeX

@conference{ 8426282,
	author = "Gomez, Maria E. and Duato, Jose and Flich, Jose and Lopez, Pedro and Robles, Antonio and Nordbotten, N. A. and Skeie, T. and Lysne, O.",
	abstract = "Interconnection networks play a key role in the fault tolerance of massively parallel computers, since faults may isolate a large fraction of the machine containing many healthy nodes. In this paper, we present a methodology to design fully adaptive fault-tolerant routing algorithms for direct interconnection networks that can be applied to different regular topologies. The methodology is mainly based on the selection of an intermediate node (if needed) for each source-destination pair. Packets are adaptively routed to the intermediate node and, from this node, they are adaptively forwarded to their destination. This methodology requires only one additional virtual channel, even for tori. Evaluation results show that the methodology is 7-fault tolerant, and for up to 14 faults, more than 99\% of the combinations are tolerated, also without significantly degrading performance in the presence of faults",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing -- HiPC 2004. 11th International Conference",
	keywords = "fault tolerant computing;multiprocessor interconnection networks;parallel processing;telecommunication network routing;telecommunication network topology;",
	note = "adaptive fault-tolerant routing;direct interconnection networks;massively parallel computers;",
	pages = "462--473",
	series = "Lecture Notes in Computer Science",
	title = "{A} new adaptive fault-tolerant routing methodology for direct networks",
	volume = 3296,
	year = 2004
}

J M Montañana, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. A transition-based fault-tolerant routing methodology for InfiniBand networks. In Parallel and Distributed Processing Symposium, 2004. Proceedings. 18th International. April 2004, 186. URL, DOI BibTeX

@conference{ 1303198,
	author = "Monta{\~n}ana, J. M. and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "Summary form only given. Currently, clusters of PCs are considered a cost-effective alternative to large parallel computers. As the number of elements increases in these systems, the probability of faults increases dramatically. Therefore, it is critical to keep the system running even in the presence of faults. The interconnection network plays a key role in its performance. InfiniBand (IBA) is a new standard interconnect suitable for clusters. Most of the fault-tolerant routing strategies proposed for massively parallel computers cannot be applied to IBA because routing and virtual channel transitions are deterministic, which prevents packets from avoiding the faults. A possible approach to provide fault-tolerance in IBA consists of using several disjoint paths between every source-destination pair of nodes and selecting the appropriate path at the source host. However, to this end, a routing algorithm able to provide enough disjoint paths, while still guaranteeing deadlock freedom, is required. We propose a simple and effective fault-tolerant methodology for IBA networks that can be applied to any network topology and meets the trade-off between fault-tolerance degree and the number of network resources devoted to it. Preliminary results show that the proposed methodology scales well and supports up to three faults in 2D and five in 3D tori using only two virtual channels.",
	booktitle = "Parallel and Distributed Processing Symposium, 2004. Proceedings. 18th International",
	doi = "10.1109/IPDPS.2004.1303198",
	isbn = "0-7695-2132-0",
	keywords = "fault tolerant computing;multiprocessor interconnection networks;network topology;parallel machines;telecommunication network routing;workstation clusters;",
	month = apr,
	pages = 186,
	title = "{A} transition-based fault-tolerant routing methodology for {I}nfini{B}and networks",
	url = "http://dx.doi.org/10.1109/IPDPS.2004.1303198",
	year = 2004
}

Manuel E Acacio, Jose Gonzalez, Jose M Garcia and Jose Duato. An architecture for high-performance scalable shared-memory multiprocessors exploiting on-chip integration. IEEE Transactions on Parallel and Distributed Systems 15(8):755 - 768, 2004. URL BibTeX

@article{ 2004368344587,
	author = "Acacio, Manuel E. and Gonzalez, Jose and Garcia, Jose M. and Duato, Jose",
	abstract = "Recent technology improvements allow multiprocessor designers to put some key components inside the processor chip, such as the memory controller, the coherence hardware, and the network interface/router. In this paper, we exploit such integration scale, presenting a novel node architecture aimed at reducing the long L2 miss latencies and the memory overhead of using directories that characterize cc-NUMA machines and limit their scalability. Our proposal replaces the traditional directory with a novel three-level directory architecture, as well as it adds a small shared data cache to each of the nodes of a multiprocessor system. Due to their small size, the first-level directory and the shared data cache are integrated into the processor chip in every node, which enhances performance by saving accesses to the slower main memory. Scalability is guaranteed by having the second and third-level directories out of the processor chip and using compressed data structures. A taxonomy of the L2 misses, according to the actions performed by the directory to satisfy them, is also presented. Using execution-driven simulations, we show that significant latency reductions can be obtained by using the proposed node architecture, which translates into reductions of more than 30 percent in several cases in the application execution time. \copyright{} 2004 IEEE.",
	doi = "10.1109/TPDS.2004.27",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Data storage equipment",
	keywords = "Cache memory;Computer architecture;Computer simulation;Computer systems;Interfaces;Microprocessor chips;Routers;",
	note = "Directory memory overhead;Multiprocessor system;Shared data cache;Shared memory multiprocessors;Three level directory;",
	number = 8,
	pages = "755--768",
	title = "{A}n architecture for high-performance scalable shared-memory multiprocessors exploiting on-chip integration",
	url = "http://dx.doi.org/10.1109/TPDS.2004.27",
	volume = 15,
	year = 2004
}

Maria E Gomez, Jose Flich, Pedro Lopez, Antonio Robles, Jose Duato, N A Nordbotten, O Lysne and T Skeie. An effective fault-tolerant routing methodology for direct networks. In Parallel Processing, 2004. ICPP 2004. International Conference on. 2004, 222 - 231 vol.1. URL, DOI BibTeX

@conference{ 1327925,
	author = "Gomez, Maria E. and Flich, Jose and Lopez, Pedro and Robles, Antonio and Duato, Jose and Nordbotten, N. A. and Lysne, O. and Skeie, T.",
	abstract = "Current massively parallel computing systems are being built with thousands of nodes, which significantly affect the probability of failure. M. E. Gomez proposed a methodology to design fault-tolerant routing algorithms for direct interconnection networks. The methodology uses a simple mechanism: for some source-destination pairs, packets are first forwarded to an intermediate node, and later, from this node to the destination node. Minimal adaptive routing is used along both subpaths. For those cases where the methodology cannot find a suitable intermediate node, it combines the use of intermediate nodes with two additional mechanisms: disabling adaptive routing and using misrouting on a per-packet basis. While the combination of these three mechanisms tolerates a large number of faults, each one requires adding some hardware support in the network and also introduces some overhead. In this paper, we perform an in-depth detailed analysis of the impact of these mechanisms on network behaviour. We analyze the impact of the three mechanisms separately and combined. The ultimate goal of this paper is to obtain a suitable combination of mechanisms that is able to meet the trade-off between fault-tolerance degree, routing complexity, and performance.",
	booktitle = "Parallel Processing, 2004. ICPP 2004. International Conference on",
	doi = "10.1109/ICPP.2004.1327925",
	issn = "0190-3918",
	keywords = "direct networks; fault-tolerant routing algorithm; in-depth detailed analysis; interconnection networks; minimal adaptive routing; parallel computing system; communication complexity; fault tolerant computing; multiprocessor interconnection networks; parallel processing;",
	month = aug,
	pages = "222--231",
	title = "{A}n effective fault-tolerant routing methodology for direct networks",
	url = "http://dx.doi.org/10.1109/ICPP.2004.1327925",
	volume = 1,
	year = 2004
}

Maria E Gomez, Jose Flich, Pedro Lopez, Antonio Robles, Jose Duato, N A Nordbotten, O Lysne and T Skeie. An effective fault-tolerant routing methodology for direct networks. 2004, 222 - 31. BibTeX

@conference{ 8279975,
	author = "Gomez, Maria E. and Flich, Jose and Lopez, Pedro and Robles, Antonio and Duato, Jose and Nordbotten, N. A. and Lysne, O. and Skeie, T.",
	abstract = "Current massively parallel computing systems are being built with thousands of nodes, which significantly affect the probability of failure. M. E. Gomez proposed a methodology to design fault-tolerant routing algorithms for direct interconnection networks. The methodology uses a simple mechanism: for some source-destination pairs, packets are first forwarded to an intermediate node, and later, from this node to the destination node. Minimal adaptive routing is used along both subpaths. For those cases where the methodology cannot find a suitable intermediate node, it combines the use of intermediate nodes with two additional mechanisms: disabling adaptive routing and using misrouting on a per-packet basis. While the combination of these three mechanisms tolerates a large number of faults, each one requires adding some hardware support in the network and also introduces some overhead. In this paper, we perform an in-depth detailed analysis of the impact of these mechanisms on network behaviour. We analyze the impact of the three mechanisms separately and combined. The ultimate goal of this paper is to obtain a suitable combination of mechanisms that is able to meet the trade-off between fault-tolerance degree, routing complexity, and performance.",
	address = "Los Alamitos, CA, USA",
	booktitle = "2004 International Conference on Parallel Processing",
	keywords = "communication complexity;fault tolerant computing;multiprocessor interconnection networks;parallel processing;parallel computing system;fault-tolerant routing algorithm;interconnection networks;minimal adaptive routing;in-depth detailed analysis;direct networks;",
	pages = "222--231",
	title = "{A}n effective fault-tolerant routing methodology for direct networks",
	volume = 1,
	year = 2004
}

JC Sancho, Antonio Robles and Jose Duato. An effective methodology to improve the performance of the Up*/down* routing algorithm. IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS 15(8):740-754, August 2004. BibTeX

@article{ isi:000222073200006,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are being considered as a cost-effective alternative to parallel computers. Most NOWs are arranged as a switch-based network and provide mechanisms for discovering the network topology. Hence, they provide support for both regular and irregular topologies, which makes routing and deadlock avoidance quite complicated. Current proposals use the Up{*}/down{*} routing algorithm to remove cyclic dependencies between channels and avoid deadlock. However, routing is considerably restricted and most messages must follow nonminimal paths, increasing latency and wasting resources. In this work, we propose and evaluate a simple and effective methodology to compute Up{*}/down{*} routing tables. The new methodology is based on computing a depth-first search (DFS) spanning tree on the network graph that decreases the number of routing restrictions with respect to the breadth-first search (BFS) spanning tree used by the traditional methodology. Additionally, we propose different heuristic rules for computing the spanning trees to improve the efficiency of Up{*}/down{*} routing. Evaluation results for several different topologies show that computing the Up{*}/down{*} routing tables by using the new methodology increases throughput by a factor of up to 2.48 in large networks with respect to the traditional methodology, and also reduces latency significantly.",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	month = aug,
	number = 8,
	pages = "740--754",
	title = "{A}n effective methodology to improve the performance of the {U}p{*}/down{*} routing algorithm",
	volume = 15,
	year = 2004
}

Jose C Sancho, Antonio Robles and Jose Duato. An effective methodology to improve the performance of the up*/down* routing algorithm. IEEE Transactions on Parallel and Distributed Systems 15(8):740 - 754, 2004. URL BibTeX

@article{ 2004368344586,
	author = "Sancho, Jose C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are being considered as a cost-effective alternative to parallel computers. Most NOWs are arranged as a switch-based network and provide mechanisms for discovering the network topology. Hence, they provide support for both regular and irregular topologies, which makes routing and deadlock avoidance quite complicated. Current proposals use the Up*/down* routing algorithm to remove cyclic dependencies between channels and avoid deadlock. However, routing is considerably restricted and most messages must follow nonminimal paths, increasing latency and wasting resources. In this work, we propose and evaluate a simple and effective methodology to compute Up*/down* routing tables. The new methodology is based on computing a depth-first search (DFS) spanning tree on the network graph that decreases the number of routing restrictions with respect to the breadth-first search (BFS) spanning tree used by the traditional methodology. Additionally, we propose different heuristic rules for computing the spanning trees to improve the efficiency of Up*/down* routing. Evaluation results for several different topologies show that computing the Up*/down* routing tables by using the new methodology increases throughput by a factor of up to 2.48 in large networks with respect to the traditional methodology, and also reduces latency significantly. © 2004 IEEE.",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Computer networks",
	keywords = "Algorithms;Computer simulation;Computer system recovery;Interconnection networks;Parallel processing systems;Trees;Deadlock avoidance;Irregular topologies;Routing algorithms;Spanning tree;",
	number = 8,
	pages = "740--754",
	title = "{A}n effective methodology to improve the performance of the up*/down* routing algorithm",
	url = "http://dx.doi.org/10.1109/TPDS.2004.28",
	volume = 15,
	year = 2004
}

J C Sancho, Antonio Robles and Jose Duato. An effective methodology to improve the performance of the up*/down* routing algorithm. IEEE Transactions on Parallel and Distributed Systems 15(8):740 - 754, 2004. URL BibTeX

@article{ 8115437,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are being considered as a cost-effective alternative to parallel computers. Most NOWs are arranged as a switch-based network and provide mechanisms for discovering the network topology. Hence, they provide support for both regular and irregular topologies, which makes routing and deadlock avoidance quite complicated. Current proposals use the up*/down* routing algorithm to remove cyclic dependencies between channels and avoid deadlock. However, routing is considerably restricted and most messages must follow nonminimal paths, increasing latency and wasting resources. We propose and evaluate a simple and effective methodology to compute up*/down* routing tables. The new methodology is based on computing a depth-first search (DFS) spanning tree on the network graph that decreases the number of routing restrictions with respect to the breadth-first search (BFS) spanning tree used by the traditional methodology. Additionally, we propose different heuristic rules for computing the spanning trees to improve the efficiency of up*/down* routing. Evaluation results for several different topologies show that computing the up*/down* routing tables by using the new methodology increases throughput by a factor of up to 2.48 in large networks with respect to the traditional methodology, and also reduces latency significantly.",
	address = "USA",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "concurrency theory;network operating systems;network topology;telecommunication network routing;tree searching;workstation clusters;up*/down* routing algorithm;networks of workstations;depth-first search spanning tree;network graph;breadth-first search;irregular topologies;deadlock avoidance;",
	number = 8,
	pages = "740--754",
	title = "{A}n effective methodology to improve the performance of the up*/down* routing algorithm",
	url = "http://dx.doi.org/10.1109/TPDS.2004.28",
	volume = 15,
	year = 2004
}

Maria E Gomez, Jose Duato, Jose Flich, Pedro Lopez, Antonio Robles, N A Nordbotten, O Lysne and T Skeie. An Efficient Fault-Tolerant Routing Methodology for Meshes and Tori. Computer Architecture Letters 3(1):3 - 3, 2004. URL, DOI BibTeX

@article{ 1650124,
	author = "Gomez, Maria E. and Duato, Jose and Flich, Jose and Lopez, Pedro and Robles, Antonio and Nordbotten, N. A. and Lysne, O. and Skeie, T.",
	abstract = "In this paper we present a methodology to design fault-tolerant routing algorithms for regular direct interconnection networks. It supports fully adaptive routing, does not degrade performance in the absence of faults, and supports a reasonably large number of faults without significantly degrading performance. The methodology is mainly based on the selection of an intermediate node (if needed) for each source-destination pair. Packets are adaptively routed to the intermediate node and, at this node, without being ejected, they are adaptively forwarded to their destinations. In order to allow deadlock-free minimal adaptive routing, the methodology requires only one additional virtual channel (for a total of three), even for tori. Evaluation results for a 4 x 4 x 4 torus network show that the methodology is 5-fault tolerant. Indeed, for up to 14 link failures, the percentage of fault combinations supported is higher than 99.96%. Additionally, network throughput degrades by less than 10% when injecting three random link faults without disabling any node. In contrast, a mechanism similar to the one proposed in the BlueGene/L, that disables some network planes, would strongly degrade network throughput by 79%.",
	doi = "10.1109/L-CA.2004.1",
	issn = "1556-6056",
	journal = "Computer Architecture Letters",
	month = jan # "--" # dec,
	number = 1,
	pages = "3",
	title = "{A}n {E}fficient {F}ault-{T}olerant {R}outing {M}ethodology for {M}eshes and {T}ori",
	url = "http://dx.doi.org/10.1109/L-CA.2004.1",
	volume = 3,
	year = 2004
}

J Domenech, A Pont, Julio Sahuquillo and J A Gil. An experimental framework for testing Web prefetching techniques. In Euromicro Conference, 2004. Proceedings. 30th. 2004, 214 - 221. URL, DOI BibTeX

@conference{ 1333374,
	author = "Domenech, J. and Pont, A. and Sahuquillo, Julio and Gil, J. A.",
	abstract = "The popularity of Web objects, and by extension the popularity of the Web sites, besides the appearance of clear footprints in user's accesses that show a considerable spatial locality, make possible to predict future accesses based on the current ones. This fact permits to implement also prefetching techniques in Web architecture in order to reduce the latency perceived by the users. Although the open literature presents some approaches in this sense, the huge variety of prefetching algorithms, and the different scenarios and conditions where they are applied make very difficult to compare performance and to obtain correct conclusions that permit researchers to improve their proposals or even detect in which conditions one solution is more convenient than others. This is the main reason why we propose a new and free available environment in order to implement and study prefetching techniques efficiently. Our framework is a hybrid implementation that combines both real and simulated parts in order to provide flexibility and accuracy. It reproduces in detail the behavior of Web users, proxy servers and original servers. The simulator also includes a module to provide performance results, such as precision (prefetching accuracy), recall, response time, and byte transference.",
	booktitle = "Euromicro Conference, 2004. Proceedings. 30th",
	doi = "10.1109/EURMIC.2004.1333374",
	isbn = "0-7695-2199-1",
	keywords = "Internet latency; Web architecture performance; Web prefetching techniques; Web sites; user access; Internet; computer network reliability; storage management;",
	month = "31 " # aug # "--3 " # sep,
	pages = "214--221",
	publisher = "IEEE Computer Society",
	title = "{A}n experimental framework for testing {W}eb prefetching techniques",
	url = "http://dx.doi.org/10.1109/EURMIC.2004.1333374",
	year = 2004
}

Salvador Petit, Julio Sahuquillo, A Pont and D Kaeli. Characterizing the dynamic behavior of workload execution in SVM systems. In Computer Architecture and High Performance Computing, 2004. SBAC-PAD 2004. 16th Symposium on. 2004, 230 - 237. URL, DOI BibTeX

@conference{ 1364758,
	author = "Petit, Salvador and Sahuquillo, Julio and Pont, A. and Kaeli, D.",
	abstract = "The overhead associated with software management of shared virtual memory (SVM) systems can seriously impact overall system performance. One way to remedy this situation is to design more efficient SVM consistency protocols. In this paper we study a number of parallel workload characteristics that can negatively impact the performance of SVM systems. We attempt to quantify the sources of performance loss in some parallel workloads. Our goal is to better understand these characteristics, enabling us to develop SVM protocols that can adjust to dynamics in workload behavior. This paper has three main contributions: i) we measure the contention for synchronization resources, showing how applications exhibit distinct phases during their execution, ii) we quantify the relationship between page size and fragmentation/false sharing while varying the sharing unit size, and iii) we study the synergies between the contention for synchronization resources and fragmentation/false sharing, providing hints for developing improved protocols.",
	booktitle = "Computer Architecture and High Performance Computing, 2004. SBAC-PAD 2004. 16th Symposium on",
	doi = "10.1109/SBAC-PAD.2004.12",
	isbn = "0-7695-2240-8",
	keywords = "SVM consistency protocols; parallel workload characteristics; shared virtual memory system; software management; synchronization resources; workload execution; performance evaluation; protocols; resource allocation; shared memory systems; synchronisation",
	month = oct,
	pages = "230--237",
	title = "{C}haracterizing the dynamic behavior of workload execution in {SVM} systems",
	url = "http://dx.doi.org/10.1109/SBAC-PAD.2004.12",
	year = 2004
}

J M Stine, N P Carter and Jose Flich. Comparing Adaptive Routing and Dynamic Voltage Scaling for Link Power Reduction. Computer Architecture Letters 3(1):4 - 4, 2004. DOI BibTeX

@article{ 1650125,
	author = "Stine, J. M. and Carter, N. P. and Flich, Jose",
	abstract = "We compare techniques that dynamically scale the voltage of individual network links to reduce power consumption with an approach in which all links in the network are set to the same voltage and adaptive routing is used to distribute load across the network. Our results show that adaptive routing with static network link voltages outperforms dimension-order routing with dynamic link voltages in all cases, because the adaptive routing scheme can respond more quickly to changes in network demand. Adaptive routing with static link voltages also outperforms adaptive routing with dynamic link voltages in many cases, although dynamic link voltage scaling gives better behavior as the demand on the network grows.",
	doi = "10.1109/L-CA.2004.5",
	issn = "1556-6056",
	journal = "Computer Architecture Letters",
	month = jan # "--" # dec,
	number = 1,
	pages = "4",
	title = "{C}omparing {A}daptive {R}outing and {D}ynamic {V}oltage {S}caling for {L}ink {P}ower {R}eduction",
	volume = 3,
	year = 2004
}

Bilal Zafar, Timothy M Pinkston, Aurelio Bermudez and Jose Duato. Deadlock-free dynamic reconfiguration over InfiniBand networks. Parallel Algorithms and Applications 19(2-3):127 - 143, 2004. URL BibTeX

@article{ 2004398371068,
	author = "Zafar, Bilal and Pinkston, Timothy M. and Bermudez, Aurelio and Duato, Jose",
	abstract = "InfiniBand Architecture (IBA) is a newly established general-purpose interconnect standard applicable to local area, system area and storage area networking and I/O. Networks based on this standard should be capable of tolerating topological changes due to resource failures, link/switch activations, and/or hot swapping of components. In order to maintain connectivity, the network's routing function may need to be reconfigured on each topological change. Although the architecture has various mechanisms useful for configuring the network, no strategy or procedure is specified for ensuring deadlock freedom during dynamic network reconfiguration. In this paper, a method for applying the Double Scheme over InfiniBand networks is proposed. The Double Scheme provides a systematic way of reconfiguring a network dynamically while ensuring freedom from deadlocks. We show how features and mechanisms available in IBA for other purposes can also be used to implement dynamic network reconfiguration based on the Double Scheme. We also propose new mechanisms that may be considered in future versions of the IBA specification for making dynamic reconfiguration and other subnet management operations more efficient.",
	issn = "1063-7192",
	journal = "Parallel Algorithms and Applications",
	key = "Computer networks",
	keywords = "Bandwidth;Costs;Input output programs;Interconnection networks;Optimization;Probability;Quality of service;Routers;Servers;Deadlock-free dynamic reconfiguration;Double scheme;InfiniBand architecture;Network management;",
	number = "2--3",
	pages = "127--143",
	title = "{D}eadlock-free dynamic reconfiguration over {I}nfini{B}and networks",
	url = "http://dx.doi.org/10.1080/10637190410001725463",
	volume = 19,
	year = 2004
}

T Skeie, O Lysne, Jose Flich, Pedro Lopez, Antonio Robles and Jose Duato. LASH-TOR: a generic transition-oriented routing algorithm. In Parallel and Distributed Systems, 2004. ICPADS 2004. Proceedings. Tenth International Conference on. 2004, 595 - 604. URL, DOI BibTeX

@conference{ 1316144,
	author = "Skeie, T. and Lysne, O. and Flich, Jose and Lopez, Pedro and Robles, Antonio and Duato, Jose",
	abstract = "Cluster networks are seen as the future access networks for multimedia streaming, e-commerce, network storage, etc. For these applications, performance and high availability are particularly crucial. Regular topologies are preferred when performance is the primary concern. However, due to spatial constraints or fault-related issues, the network structure may become irregular, which makes more difficult to find deadlock-free minimal paths. Over the recent years, several solutions have been proposed. One of them is the LASH routing, which enables minimal routing by assigning paths to different virtual layers. In this paper, we propose an extension of LASH in order to reduce the number of required virtual layers by allowing transitions between virtual layers. Evaluation results show that the new routing scheme (LASH-TOR) is able to obtain full minimal routing with a reduced number of virtual channels. For torus and mesh networks, with only two virtual channels, LASH throughput is increased by an average factor of improvement of 3.30 for large networks. For regular networks with some unconnected (faulty) links, equal performance improvements are achieved. Even for highly irregular networks of size up to 128 switches the new routing scheme only needs three virtual channels for guaranteeing minimal routing. Besides, LASH-TOR performs well compared to dimension order routing for mesh and torus networks.",
	booktitle = "Parallel and Distributed Systems, 2004. ICPADS 2004. Proceedings. Tenth International Conference on",
	doi = "10.1109/ICPADS.2004.1316144",
	isbn = "0-7695-2152-5",
	issn = "1521-9097",
	keywords = "LASH routing; LASH-TOR; access networks; cluster networks; deadlock-free minimal paths; e-commerce; mesh network; multimedia streaming; network storage; network structure; spatial constraints; torus network; transition-oriented routing algorithm; virtual",
	month = "7--9 " # jul,
	pages = "595--604",
	title = "{LASH}-{TOR}: a generic transition-oriented routing algorithm",
	url = "http://dx.doi.org/10.1109/ICPADS.2004.1316144",
	year = 2004
}

Juan Manuel Orduna, Federico Silla and Jose Duato. On the development of a communication-aware task mapping technique. Journal of Systems Architecture 50(4):207 - 220, 2004. URL BibTeX

@article{ 2004178128206,
	author = "Orduna, Juan Manuel and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. In these systems, although currently existing networks actually provide enough bandwidth for the existing applications and workstations, the trend is towards the interconnection network becoming the system bottleneck. Therefore, in the future, scheduling strategies will have to take into account the communication requirements of the applications and the communication bandwidth that the network can offer. One of the key issues in these strategies is the task mapping technique used when the network becomes the system bottleneck. In this paper, we propose a communication-aware mapping technique that tries to match as well as possible the existing network resources to the communication requirements of the applications running on the system. Also, we evaluate the mapping technique using real MPI application traces with timestamps. Evaluation results show that the use of the proposed mapping technique better exploits the available network bandwidth, improving load balancing and increasing the throughput that can be delivered by the network. Therefore, the proposed technique can be used in the design of communication-aware scheduling strategies for those situations where the communication requirements lead the network bandwidth to become the system performance bottleneck. © 2003 Elsevier B.V. All rights reserved.",
	issn = "1383-7621",
	journal = "Journal of Systems Architecture",
	key = "Interconnection networks",
	keywords = "Bandwidth;Computational complexity;Computer systems;Cost effectiveness;Evaluation;Mapping;Problem solving;Program processors;Scheduling;Cluster computing;Task scheduling;",
	number = 4,
	pages = "207--220",
	title = "{O}n the development of a communication-aware task mapping technique",
	url = "http://dx.doi.org/10.1016/j.sysarc.2003.09.002",
	volume = 50,
	year = 2004
}

Jose Duato. Program chair's message. 2004, x - x. BibTeX

@conference{ 2004228179040,
	author = "Duato, Jose",
	address = "Madrid, Spain",
	booktitle = "IEEE High-Performance Computer Architecture Symposium Proceedings",
	issn = "1530-0897",
	pages = "x",
	title = "{P}rogram chair's message",
	volume = 10,
	year = 2004
}

Francisco J Alfaro, Jose L Sanchez and Jose Duato. QoS in InfiniBand subnetworks. IEEE Transactions on Parallel and Distributed Systems 15(9):810 - 823, 2004. URL BibTeX

@article{ 2004408393368,
	author = "Alfaro, Francisco J. and Sanchez, Jose L. and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) has been proposed as an industry standard both for communication between processing nodes and I/O devices and for interprocessor communication. It replaces the traditional bus-based interconnect with a switch-based network for connecting processing nodes and I/O devices. It is being developed by the InfiniBand(SM) Trade Association (IBTA) in the aim to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) required by present and future server systems. For this purpose, IBA provides a series of mechanisms that are able to guarantee QoS to the applications. In previous papers, we have proposed a strategy to compute the InfiniBand arbitration tables. In one of these, we presented and evaluated our proposal to treat traffic with bandwidth requirements. In another, we evaluated our strategy to compute the InfiniBand arbitration tables for traffic with delay requirements, which is a more complex task. In this paper, we will evaluate both these proposals together. Furthermore, we will also adapt these proposals in order to treat VBR traffic without QoS guarantees, but achieving very good results. Performance results show that, with a correct treatment of each traffic class in the arbitration of the output port, all traffic classes reach their QoS requirements. © 2004 IEEE.",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Interconnection networks",
	keywords = "Availability;Bandwidth;Computer architecture;Computer simulation;Mathematical models;Performance;Quality of service;Reliability;Servers;Telecommunication links;Telecommunication traffic;InfiniBand architecture;Interprocessor communication;Physical link;QoS requirements;",
	number = 9,
	pages = "810--823",
	title = "{Q}o{S} in {I}nfini{B}and subnetworks",
	url = "http://dx.doi.org/10.1109/TPDS.2004.46",
	volume = 15,
	year = 2004
}

F J Alfaro, J L Sanchez and Jose Duato. QoS in InfiniBand subnetworks. IEEE Transactions on Parallel and Distributed Systems 15(9):810 - 823, 2004. BibTeX

@article{ 8094175,
	author = "Alfaro, F. J. and Sanchez, J. L. and Duato, Jose",
	abstract = "The InfiniBand architecture (IBA) has been proposed as an industry standard both for communication between processing nodes and I/O devices and for interprocessor communication. It replaces the traditional bus-based interconnect with a switch-based network for connecting processing nodes and I/O devices. It is being developed by the InfiniBand(SM) Trade Association (IBTA) in the aim to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) required by present and future server systems. For this purpose, IBA provides a series of mechanisms that are able to guarantee QoS to the applications. In previous papers, we have proposed a strategy to compute the InfiniBand arbitration tables. In one of these, we presented and evaluated our proposal to treat traffic with bandwidth requirements. In another, we evaluated our strategy to compute the InfiniBand arbitration tables for traffic with delay requirements, which is a more complex task. In this paper, we evaluate both these proposals together. Furthermore, we also adapt these proposals in order to treat VBR traffic without QoS guarantees, but achieving very good results. Performance results show that, with a correct treatment of each traffic class in the arbitration of the output port, all traffic classes reach their QoS requirements.",
	address = "USA",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "bandwidth allocation;multiplexing;quality of service;queueing theory;telecommunication traffic;workstation clusters;InfiniBand architecture;I/O devices;interprocessor communication;bus-based interconnect;switch-based network;processing nodes;InfiniBand Trade Association;quality of service;QoS;InfiniBand arbitration tables;VBR traffic;InfiniBand subnetworks;",
	number = 9,
	pages = "810--823",
	title = "{Q}o{S} in {I}nfini{B}and subnetworks",
	volume = 15,
	year = 2004
}

Marina Alonso, J M Martinez, Vicente Santonja and Pedro Lopez. Reducing power consumption in interconnection networks by dynamically adjusting link width. 2004, 882 - 890. BibTeX

@conference{ 8314163,
	author = "Alonso, Marina and Martinez, J. M. and Santonja, Vicente and Lopez, Pedro",
	abstract = "The huge increase both in size and complexity of high-end multiprocessor systems has triggered their power consumption. Air or liquid cooling systems are needed, which, in turn, increases power consumption. Another important percentage of the consumption is due to the interconnection network. In this paper, we propose a mechanism that dynamically reduces the available network bandwidth when traffic becomes low. Unlike other approaches that completely switch links off when they are not fully utilized, our mechanism is based on reducing their bandwidth by narrowing their width. As the topology of the network is not modified, the same routing algorithm can be used regardless of the power consumption level, which simplifies the router design. By using this strategy, the consumption may be strongly reduced. In fact, the lower bound of this reduction is a design parameter of the mechanism. The price to pay is an increase in the message latency with low network loads.",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2004 Parallel Processing. 10th International Euro-Par Conference. Proceedings",
	keywords = "bandwidth allocation;multiprocessor interconnection networks;power consumption;telecommunication links;telecommunication network routing;telecommunication traffic;power consumption reduction;interconnection networks;link width adjustment;multiprocessor systems;network bandwidth;",
	pages = "882--890",
	series = "Lecture Notes in Computer Science",
	title = "{R}educing power consumption in interconnection networks by dynamically adjusting link width",
	volume = 3149,
	year = 2004
}

L G Cardenas, Julio Sahuquillo, A Pont and J A Gil. The multikey Web cache simulator: a platform for designing proxy cache management techniques. In Parallel, Distributed and Network-Based Processing, 2004. Proceedings. 12th Euromicro Conference on. 2004, 390 - 397. URL, DOI BibTeX

@conference{ 1271471,
	author = "Cardenas, L. G. and Sahuquillo, Julio and Pont, A. and Gil, J. A.",
	abstract = "Proxy caches have become an important mechanism to reduce latencies. Efficient management techniques for proxy caches which exploits Web-objects inherent characteristics are an essential key to reach good performance. One important segment of the replacement algorithms being applied today are the multikey algorithms that use several key or object characteristics to decide which object or objects must be replaced. This feature is not considered in most of the current simulators. In this paper we propose a proxy-cache platform to check the performance of Web object based on multikey management techniques and algorithms. The proposed platform is coded in a modular way, which allows the implementation of new algorithms or policies proposals in an easy and robust manner. In addition to the classical performance metrics like the hit ratio and the byte hit ratio, the proposed framework also offers the response time perceived by users.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2004. Proceedings. 12th Euromicro Conference on",
	doi = "10.1109/EMPDP.2004.1271471",
	isbn = "0-7695-2083-9",
	keywords = "Web-objects; byte hit ratio; multikey Web cache simulator; multikey algorithms; multikey management techniques; proxy cache replacement algorithms; proxy caches management techniques; Internet; cache storage; digital simulation; performance evaluation;",
	month = feb,
	pages = "390--397",
	title = "{T}he multikey {W}eb cache simulator: a platform for designing proxy cache management techniques",
	url = "http://dx.doi.org/10.1109/EMPDP.2004.1271471",
	year = 2004
}

Aurelio Bermudez, Rafael Casado, Francisco J Quiles and Jose Duato. Use of provisional routes to speed-up change assimilation in InfiniBand networks. 2004, 2621 - 2628. BibTeX

@conference{ 2005058819970,
	author = "Bermudez, Aurelio and Casado, Rafael and Quiles, Francisco J. and Duato, Jose",
	abstract = "The InfiniBand architecture has been proposed as a technology both for communication between processing nodes and I/O devices, and for interprocessor communication. The InfiniBand specification defines a basic management infrastructure that is responsible for subnet configuration, activation, and fault tolerance. Each time a topology change is detected, management entities collect the current subnet topology. After that, new forwarding tables have to be computed and uploaded to routing devices. The time required to compute these tables is a critical issue, due to application traffic being negatively affected by the temporary lack of connectivity. In this paper we present a way to compute a valid set of subnet routes in a short period of time. These provisional routes can be immediately distributed to routing devices. After that, final routes can be later uploaded without affecting user traffic.",
	address = "Santa Fe, NM, United States",
	booktitle = "Proceedings - International Parallel and Distributed Processing Symposium, IPDPS 2004 (Abstracts and CD-ROM)",
	key = "Computer networks",
	keywords = "Algorithms;Information theory;Interfaces (computer);Packet networks;Program processors;Topology;",
	note = "Host channel adapters (HCA);InfiBand networks;Processor nodes;Subnets;",
	pages = "2621--2628",
	title = "Use of provisional routes to speed-up change assimilation in {InfiniBand} networks",
	volume = 18,
	year = 2004
}

A Bermudez, R Casado, F J Quiles and Jose Duato. Use of provisional routes to speed-up change assimilation in InfiniBand networks. 2004, 186. URL BibTeX

@conference{ 8126616,
	author = "Bermudez, A. and Casado, R. and Quiles, F. J. and Duato, Jose",
	abstract = "Summary form only given. The InfiniBand architecture has been proposed as a technology both for communication between processing nodes and I/O devices, and for interprocessor communication. The InfiniBand specification defines a basic management infrastructure that is responsible for subnet configuration, activation, and fault tolerance. Each time a topology change is detected, management entities collect the current subnet topology. After that, new forwarding tables have to be computed and uploaded to routing devices. The time required to compute these tables is a critical issue, due to application traffic being negatively affected by the temporary lack of connectivity. We present a way to compute a valid set of subnet routes in a short period of time. These provisional routes can be immediately distributed to routing devices. After that, final routes can be later uploaded without affecting user traffic",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. 18th International Parallel and Distributed Processing Symposium",
	doi = "10.1109/IPDPS.2004.1303199",
	keywords = "fault tolerant computing;multiprocessing systems;network topology;telecommunication network routing;telecommunication traffic;",
	note = "provisional route;infiniband network;InfiniBand architecture;processing node;I/O device;interprocessor communication;subnet configuration;fault tolerance;routing device;application traffic;user traffic;",
	pages = "186",
	title = "Use of provisional routes to speed-up change assimilation in {InfiniBand} networks",
	url = "http://dx.doi.org/10.1109/IPDPS.2004.1303199",
	year = 2004
}

Timothy Mark Pinkston, Bilal Zafar and Jose Duato. A method for applying double scheme dynamic reconfiguration over InfiniBand. 2003, 793 - 800. BibTeX

@conference{ 2004148099398,
	author = "Pinkston, Timothy Mark and Zafar, Bilal and Duato, Jose",
	abstract = "InfiniBand Architecture is a newly established general-purpose interconnect standard applicable to local area, system area and storage area networking and I/O. Networks based on this standard should be capable of tolerating topological changes due to resource failures, link/switch activations, and/or hot swapping of components. In order to maintain connectivity, the network's routing function may need to be reconfigured on each topological change. Although the architecture has various mechanisms useful for configuring the network, no strategy or procedure is specified for ensuring deadlock freedom during dynamic network reconfiguration. In this paper, a method for applying the Double Scheme [1] over InfiniBand networks is proposed. The Double Scheme provides a systematic way of reconfiguring a network dynamically while ensuring freedom from deadlocks. We show how features and mechanisms available in InfiniBand Architecture for other purposes can also be used to implement dynamic network reconfiguration based on the Double Scheme. We also propose new mechanisms that may be considered in future versions of the spec for making dynamic reconfiguration and other subnet management operations more efficient.",
	address = "Las Vegas, NV, United States",
	booktitle = "Proceedings of the International Conference on Parallel and Distributed Processing Techniques and Applications",
	key = "Data communication systems",
	keywords = "Local area networks;Multiprocessing systems;Packet networks;Packet switching;Real time systems;Routers;Servers;",
	note = "Deadlock-free dynamic reconfiguration;Double scheme;Infiniband architecture;Routing;Storage area networking;System area networking;",
	pages = "793--800",
	title = "A method for applying double scheme dynamic reconfiguration over {InfiniBand}",
	volume = 2,
	year = 2003
}

O Lysne, T M Pinkston and Jose Duato. A methodology for developing dynamic network reconfiguration processes. 2003, 77 - 86. BibTeX

@conference{ 8301784,
	author = "Lysne, O. and Pinkston, T. M. and Duato, Jose",
	abstract = "Dynamic network reconfiguration is defined as the change from one routing function to another while the network is up and running. The main challenge is avoidance of deadlocks, while keeping restrictions on packet injection and forwarding minimal. Current approaches either require virtual channels in the network, or they work only for a limited set of routing algorithms. We present a methodology for devising deadlock free and dynamic transitions between an old and a new routing function. The methodology is independent of topology and puts no restrictions on either routing function. Furthermore, it does not require any virtual channels to guarantee deadlock freedom. This research is motivated by the current trend toward using increasingly larger Internet servers based on clusters of PCs and the very high availability requirements of those as well as other local, system, and storage area network-based systems",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 2003 International Conference on Parallel Processing",
	keywords = "concurrency control;Internet;multiprocessor interconnection networks;telecommunication network routing;",
	note = "dynamic network reconfiguration;deadlock avoidance;virtual channels;network routing;Internet server;storage area network-based system;local area network;system area network;interconnection network architecture;",
	pages = "77--86",
	title = "A methodology for developing dynamic network reconfiguration processes",
	year = 2003
}

F J Alfaro, J L Sanchez and Jose Duato. A new proposal to fill in the InfiniBand arbitration tables. 2003, 133 - 40. BibTeX

@conference{ 8301790,
	author = "Alfaro, F. J. and Sanchez, J. L. and Duato, Jose",
	abstract = "The InfiniBand architecture (IBA) is a new industry-standard architecture for server I/O and interprocessor communication. InfiniBand is very likely to become the de facto standard in a few years. It is being developed by the InfiniBand\textsuperscript{SM} Trade Association (IBTA) to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) necessary for present and future server systems. We propose a simple and effective strategy for configuring the IBA networks to provide the required levels of QoS. This is a global frame that allows one to do a different treatment to each kind of traffic based on its QoS requirements. It is based on the correct configuration of the mechanisms IBA provides to support QoS. We also propose a simple algorithm to maximize the number of requests to be allocated in the arbitration table that the output ports have. This proposal is evaluated and the results show that every traffic class meets its QoS requirements",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 2003 International Conference on Parallel Processing",
	keywords = "multiprocessor interconnection networks;parallel architectures;performance evaluation;quality of service;telecommunication network reliability;telecommunication traffic;",
	note = "InfiniBand arbitration table;InfiniBand architecture;server I/O;interprocessor communication;reliability;performance evaluation;scalability;quality of service;QoS;telecommunication traffic;",
	pages = "133--140",
	title = "A new proposal to fill in the {InfiniBand} arbitration tables",
	year = 2003
}

Xavier Molero, Vicente Santonja, J A Alegre and I Torregrosa. Adding instrumentation tools to the SMPL discrete-event simulation language. 2003, 28 - 37. BibTeX

@conference{ 8531137,
	author = "Molero, Xavier and Santonja, Vicente and Alegre, J. A. and Torregrosa, I.",
	abstract = "SMPL is a discrete-event simulation language used at the universities of many countries in order to teach simulation. This language is appropriate for beginners due to its simplicity and easy use. Until now, we have not found in the literature significant improvements to its implementation. In this work we have improved the modeling capabilities of SMPL by adding several instrumentation tools. They can be used to collect simulation variables such as queueing times, queueing lengths, or response times. These tools, implemented as a collection of procedures, helps programmers to improve clarity and readability of simulation programs. The way they can be used has been inspired on the instrumentation capabilities of the CSIM simulation language. In particular, we have added four main structures to collect system data: tables, qtables, boxes and meters. In this paper, the authors present a brief description of the implemented instrumentation tools and some aspects about their internal design. We also include a simple simulation example showing how to use them and how results are reported",
	address = "Ghent, Belgium",
	booktitle = "1st Industrial Simulation Conference 2003",
	keywords = "discrete event simulation;queueing theory;simulation languages;",
	note = "instrumentation tools;SMPL;discrete-event simulation language;queueing times;queueing lengths;response times;simple portable simulation language;tables;qtables;boxes;meters;",
	pages = "28--37",
	title = "Adding instrumentation tools to the {SMPL} discrete-event simulation language",
	year = 2003
}

P Morillo, J M Orduna, M Fernandez and Jose Duato. An adaptive load balancing technique for distributed virtual environment systems. 2003, 256 - 61. BibTeX

@conference{ 8116366,
	author = "Morillo, P. and Orduna, J. M. and Fernandez, M. and Duato, Jose",
	abstract = "One of the key issues in the design of scalable and cost-effective distributed virtual environment (DVE) systems is the partitioning problem. This problem consists of efficiently assigning clients (3D avatars) to the servers in the system, and some methods have been already proposed for solving it. However, only one of these methods takes into account the nonlinear behavior of DVE servers with the number of avatars they support, and this method uses a load balancing technique of local scope. As a result, it only provides good performance if the movement pattern of avatars is uniform. In this paper, we propose an adaptive load balancing technique of global scope for solving the partitioning problem in DVE systems. The global scope of the proposed technique allows to avoid DVE saturation as long as possible. Evaluation results show that the proposed strategy can improve DVE system performance, regardless of both the movement patterns of avatars and also the initial distribution of avatars in the virtual world",
	address = "Anaheim, CA, USA",
	booktitle = "Proceedings of the Fifteenth IASTED International Conference on Parallel and Distributed Computing and Systems",
	keywords = "distributed processing;resource allocation;virtual reality;",
	note = "adaptive load balancing;distributed virtual environment;3D avatars;virtual world;dynamic partitioning;",
	pages = "256--261",
	title = "An adaptive load balancing technique for distributed virtual environment systems",
	volume = 1,
	year = 2003
}

P Morillo, J M Orduna, M Fernandez and Jose Duato. An adaptive load balancing technique for distributed virtual environment systems. 2003, 256 - 261. BibTeX

@conference{ 2004138084519,
	author = "Morillo, P. and Orduna, J. M. and Fernandez, M. and Duato, Jose",
	abstract = "One of the key issues in the design of scalable and cost-effective Distributed Virtual Environment (DVE) systems is the partitioning problem. This problem consists of efficiently assigning clients (3-D avatars) to the servers in the system, and some methods have been already proposed for solving it. However, only one of these methods takes into account the non-linear behavior of DVE servers with the number of avatars they support, and this method uses a load balancing technique of local scope. As a result, it only provides good performance if the movement pattern of avatars is uniform. In this paper, we propose an adaptive load balancing technique of global scope for solving the partitioning problem in DVE systems. The global scope of the proposed technique allows to avoid DVE saturation as long as possible. Evaluation results show that the proposed strategy can improve DVE system performance, regardless of both the movement patterns of avatars and also the initial distribution of avatars in the virtual world.",
	address = "Marina del Rey, CA, United States",
	booktitle = "Proceedings of the IASTED International Conference on Parallel and Distributed Computing and Systems",
	key = "Client server computer systems",
	keywords = "Adaptive control systems;Bandwidth;Computer aided instruction;Computer supported cooperative work;Virtual reality;",
	note = "Distributed virtual environments;Dynamic partitioning;Load balancing;",
	number = 1,
	pages = "256--261",
	title = "An adaptive load balancing technique for distributed virtual environment systems",
	volume = 15,
	year = 2003
}

Jose Flich, Pedro Lopez, M P Malumbres, Jose Duato and T Rokicki. Applying in-transit buffers to boost the performance of networks with source routing. Computers, IEEE Transactions on 52(9):1134 - 1153, 2003. DOI BibTeX

@article{ 1228510,
	author = "Flich, Jose and Lopez, Pedro and Malumbres, M. P. and Duato, Jose and Rokicki, T.",
	abstract = "In this paper, we analyze in depth the effect of using ITB in the network, showing that they not only serve for guaranteeing minimal routing, but also that they are a powerful mechanism able to balance network traffic and reduce network contention. To demonstrate these capabilities, we apply the ITB mechanism to improved routing schemes, such as DFS and smart-routing. These routing algorithms (without ITB) are able to improve the performance of up*/down* by 30 percent and 90 percent, respectively, for a 32-switch network. The evaluation results show that, when ITB are used together with these improved routing algorithms, network throughput achieved by DFS and smart-routing can still be improved by 56 percent and 23 percent, respectively. However, smart-routing requires a time to compute the routing tables that rapidly grows with network size, it being impossible in practice to build networks with more than 32 switches. This high computational cost is mainly motivated by the need of obtaining deadlock-free routing tables. However, when ITB are used, one can decouple the stages of computing routing tables and breaking cycles. Moreover, as stated above, ITB can be used to reduce network contention. In this way, in this paper, we also propose a completely new routing algorithm that tries to balance network traffic by using a simple and low time consuming strategy. The proposed algorithm guarantees deadlock freedom and reduces network contention with the use of ITB. The evaluation results show that our algorithm obtains unprecedented throughputs in 32-switch networks, tripling the original up*/down* and almost doubling smart-routing.",
	doi = "10.1109/TC.2003.1228510",
	issn = "0018-9340",
	journal = "IEEE Transactions on Computers",
	keywords = "32-switch network; DFS; ITB; NOW; breaking cycles; deadlock-free routing tables; in-transit buffers; minimal routing; network contention reduction; network performance; network throughput; network traffic balancing; networks of workstations; performance;",
	month = sep,
	number = 9,
	pages = "1134--1153",
	title = "Applying in-transit buffers to boost the performance of networks with source routing",
	volume = 52,
	year = 2003
}

B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. {A. 220 - 226. BibTeX

@conference{ 2004148099314,
	author = "Caminero, B. and Carrion, C. and Quiles, F. J. and Duato, Jose and Yalamanchili, S.",
	abstract = "Over the past few years a dramatical increase in the use of multimedia applications has taken place, due mainly to the availability of fast processors and sophisticated peripherals at a low cost. Many of these applications either are inherently distributed, or need the resources of a cluster of computers. The traffic generated by multimedia applications has very different requirements than the best-effort traffic generated by conventional applications. Now, the network must deliver some sort of Quality of Service (QoS) to the flows that require it. The Multimedia Router (MMR) arises as a solution for providing such QoS support within a compact interconnection element, aimed for use in local and clustered environments. The particular needs for every kind of traffic, both multimedia and best-effort, are addressed in order to provide the multimedia flows the QoS guarantees they need, while still achieving high link utilizations. In this work, the Multimedia Router architecture is described, and some insight is given on its performance, specially regarding the interaction between buffer size and the algorithms used for link scheduling.",
	address = "Las Vegas, NV, United States",
	annote = "NOTE(review): the title was truncated to an unbalanced brace group in the export and no year is recorded; restore both from the publisher record.",
	booktitle = "Proceedings of the International Conference on Parallel and Distributed Processing Techniques and Applications",
	key = "Data communication systems",
	keywords = "Distributed computer systems;Interconnection networks;Parallel algorithms;Quality of service;Routers;Telecommunication traffic;",
	note = "Link-switch scheduling;Multimedia communication;Router architecture;",
	pages = "220--226",
	title = "{A}"
}

B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. {A. 8 pp. -. BibTeX

@conference{ 7891389,
	author = "Caminero, B. and Carrion, C. and Quiles, F. J. and Duato, Jose and Yalamanchili, S.",
	abstract = "The primary objective of the MultiMedia Router (MMR) project is the design and implementation of a compact router optimized for multimedia applications. The router is targeted for use in cluster and LAN interconnection networks, which offer different constraints and therefore differing router solutions than WANs. One of the key elements within the router are the algorithms used to decide the forwarding order of the information that traverses it: the link and switch scheduling algorithms. They help greatly to determine the QoS guarantees delivered to the application flows. Also, conventional best-effort traffic should be seamlessly integrated by scheduling algorithms, in such a way that link bandwidth is efficiently used, but without degrading the QoS guarantees of the multimedia connections. In this paper, two solutions for switch scheduling are thoroughly evaluated with mixed workloads (i.e., composed of multimedia and best-effort traffic), and their performance is compared to another well-known approach for switch scheduling, that does not consider QoS requirements when performing scheduling decisions. Results show that, when a QoS-aware switch scheduler is used, the QoS received by the multimedia flows is not affected by the presence of best-effort traffic",
	address = "Los Alamitos, CA, USA",
	annote = "NOTE(review): the title was truncated to an unbalanced brace group in the export and no year is recorded; presumably the paper on hybrid traffic handling in clustered environments with the MultiMedia Router (MMR) -- confirm both against the publisher record.",
	booktitle = "Proceedings International Parallel and Distributed Processing Symposium",
	keywords = "LAN interconnection;multimedia communication;performance evaluation;quality of service;telecommunication network routing;wide area networks;workstation clusters;",
	note = "hybrid traffic handling;clustered environments;multimedia router MMR;LAN interconnection networks;cluster networks;WANs;QoS guarantees;link bandwidth;",
	pages = "8 pp.",
	title = "{A}"
}

Timothy Mark Pinkston, Ruoming Pang and Jose Duato. Deadlock-free dynamic reconfiguration schemes for increased network dependability. IEEE Transactions on Parallel and Distributed Systems 14(8):780 - 794, 2003. URL BibTeX

@article{ 2003407655843,
	author = "Pinkston, Timothy Mark and Pang, Ruoming and Duato, Jose",
	abstract = "Network-based parallel computing systems often require the ability to reconfigure the routing algorithm to reflect changes in network topology if and when voluntary or involuntary changes occur. The process of reconfiguring a network's routing capabilities may be very inefficient and/or deadlock-prone if not handled properly. In this paper, we propose efficient and deadlock-free dynamic reconfiguration schemes that are applicable to routing algorithms and networks which use wormhole, virtual cut-through, or store-and-forward switching, combined with hard link-level flow control. One requirement is that the network architecture use virtual channels or duplicate physical channels for deadlock-handling as well as performance purposes. The proposed schemes do not impede the injection, transmission, or delivery of user packets during the reconfiguration process. Instead, they provide uninterrupted service, increased availability/reliability, and improved overall quality-of-service support as compared to traditional techniques based on static reconfiguration.",
	doi = "10.1109/TPDS.2003.1225057",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Parallel processing systems",
	keywords = "Algorithms;Communication channels (information theory);Congestion control (communication);Dynamic programming;Interconnection networks;Packet networks;Quality of service;Requirements engineering;Switching networks;Virtual reality;",
	note = "Deadlock free dynamic reconfiguration;Hard link level flow control;Routing algorithm;Virtual channels;",
	number = 8,
	pages = "780--794",
	title = "Deadlock-free dynamic reconfiguration schemes for increased network dependability",
	url = "http://dx.doi.org/10.1109/TPDS.2003.1225057",
	volume = 14,
	year = 2003
}

A Bermudez, R Casado, F J Quiles, T M Pinkston and Jose Duato. Evaluation of a subnet management mechanism for InfiniBand networks. 2003, 117 - 24. BibTeX

@conference{ 8301788,
	author = "Bermudez, A. and Casado, R. and Quiles, F. J. and Pinkston, T. M. and Duato, Jose",
	abstract = "The InfiniBand architecture is a high-performance network technology for the interconnection of processor nodes and I/O devices using a point-to-point switch-based fabric. The InfiniBand specification defines a basic management infrastructure that is responsible for subnet configuration, activation, and fault tolerance. Subnet management entities and functions are described, but the specifications do not impose any particular implementation. We present and analyze a complete subnet management mechanism for this architecture. We allow to anticipate future directions to obtain efficient management protocols",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 2003 International Conference on Parallel Processing",
	keywords = "computer network management;fault tolerant computing;multiprocessor interconnection networks;parallel architectures;protocols;",
	note = "subnet management mechanism;InfiniBand networks;InfiniBand architecture;point-to-point switch-based fabric;management infrastructure;subnet configuration;fault tolerance;",
	pages = "117--124",
	title = "Evaluation of a subnet management mechanism for {InfiniBand} networks",
	year = 2003
}

Xavier Molero, Vicente Santonja, I Torregrosa and J A Alegre. Extending the capabilities of the SMPL discrete-event simulation language. 2003, 19 - 23. BibTeX

@conference{ 8587783,
	author = "Molero, Xavier and Santonja, Vicente and Torregrosa, I. and Alegre, J. A.",
	abstract = "SMPL is an open-source simulation language used at universities of many countries both for research activities and in discrete simulation teaching. Despite of that, until now we have not found in the literature significant improvements to its original capabilities. In a previous work we proposed the addition of several instrumentation tools based on those proposed in the CSIM language. In this work we have improved the modeling capabilities of SMPL by adding several features. First of all, we have added facilities with infinite servers to model delays, and most important, customer classes can be defined in the model; the inclusion of classes can be used in order to make explicit the behavior of customers in the model and also to get statistics for each class in reports. Second, we have included explicit queues, storages, and an unqueue function, for both facilities and explicit queues, initially proposed by the creator of SMPL as a natural extension. Finally, the random number generator has been updated to that proposed by L'Ecuyer in order to avoid hardware dependency. In this paper, we present a brief description of the new features added to the SMPL simulation language, the way they can be used in the model code, and some aspects about their internal implementation",
	address = "Ghent, Belgium",
	booktitle = "The European Simulation and Modelling Conference 2003",
	keywords = "discrete event simulation;queueing theory;random number generation;simulation languages;",
	note = "SMPL discrete-event simulation language;open-source simulation language;discrete simulation teaching;CSIM language;random number generator;model code;",
	pages = "19--23",
	title = "Extending the capabilities of the {SMPL} discrete-event simulation language",
	year = 2003
}

J M M Rubio, Pedro Lopez and Jose Duato. FC3D: flow control-based distributed deadlock detection mechanism for true fully adaptive routing in wormhole networks. Parallel and Distributed Systems, IEEE Transactions on 14(8):765 - 779, 2003. URL, DOI BibTeX

@article{ 1225056,
	author = "Rubio, J. M. M. and Lopez, Pedro and Duato, Jose",
	abstract = "Two general approaches have been proposed for deadlock handling in wormhole networks. Traditionally, deadlock-avoidance strategies have been used. In this case, either routing is restricted so that there are no cyclic dependencies between channels or cyclic dependencies between channels are allowed provided that there are some escape paths to avoid deadlock. More recently, deadlock recovery strategies have begun to gain acceptance. These strategies allow the use of unrestricted fully adaptive routing, usually outperforming deadlock avoidance techniques. However, they require a deadlock detection mechanism and a deadlock recovery mechanism that is able to recover from deadlocks faster than they occur. In particular, progressive deadlock recovery techniques are very attractive because they allocate a few dedicated resources to quickly deliver deadlocked messages, instead of killing them. Unfortunately, distributed deadlock detection is usually based on crude time-outs, which detect many false deadlocks. As a consequence, messages detected as deadlocked may saturate the bandwidth offered by recovery resources, thus degrading performance. Additionally, the threshold required by the detection mechanism (the time-out) strongly depends on network load, which is not known in advance at the design stage. This limits the applicability of deadlock recovery on actual networks. We propose a novel distributed deadlock detection mechanism that uses only local information, detects all the deadlocks, considerably reduces the probability of false deadlock detection over previously proposed techniques, and is not significantly affected by variations in message length and/or message destination distribution.",
	doi = "10.1109/TPDS.2003.1225056",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "FC3D mechanism; crude time-out; deadlock detection mechanism; deadlock-avoidance strategy; deadlocked message; false deadlock detection probability; flow control-based distributed deadlock detection; message destination distribution; message length distr",
	month = aug,
	number = 8,
	pages = "765--779",
	title = "{FC3D}: flow control-based distributed deadlock detection mechanism for true fully adaptive routing in wormhole networks",
	url = "http://dx.doi.org/10.1109/TPDS.2003.1225056",
	volume = 14,
	year = 2003
}

Juan-Miguel Martinez Rubio, Pedro Lopez and Jose Duato. FC3D: Flow control-based distributed deadlock detection mechanism for true fully adaptive routing in wormhole networks. IEEE Transactions on Parallel and Distributed Systems 14(8):765 - 779, 2003. URL BibTeX

@article{ 2003407655842,
	author = "Juan-Miguel Martinez Rubio and Lopez, Pedro and Duato, Jose",
	abstract = "Two general approaches have been proposed for deadlock handling in wormhole networks. Traditionally, deadlock avoidance strategies have been used. In this case, either routing is restricted so that there are no cyclic dependencies between channels or cyclic dependencies between channels are allowed provided that there are some escape paths to avoid deadlock. More recently, deadlock recovery strategies have begun to gain acceptance. These strategies allow the use of unrestricted fully adaptive routing, usually outperforming deadlock avoidance techniques. However, they require a deadlock detection mechanism and a deadlock recovery mechanism that is able to recover from deadlocks faster than they occur. In particular, progressive deadlock recovery techniques are very attractive because they allocate a few dedicated resources to quickly deliver deadlocked messages, instead of killing them. Unfortunately, distributed deadlock detection is usually based on crude time-outs, which detect many false deadlocks. As a consequence, messages detected as deadlocked may saturate the bandwidth offered by recovery resources, thus degrading performance. Additionally, the threshold required by the detection mechanism (the time-out) strongly depends on network load, which is not known in advance at the design stage. This limits the applicability of deadlock recovery on actual networks. In this paper, we propose a novel distributed deadlock detection mechanism that uses only local information, detects all the deadlocks, considerably reduces the probability of false deadlock detection over previously proposed techniques, and is not significantly affected by variations in message length and/or message destination distribution.",
	doi = "10.1109/TPDS.2003.1225056",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Distributed computer systems",
	keywords = "Adaptive control systems;Command and control systems;Congestion control (communication);Data communication systems;Probability distributions;Requirements engineering;Resource allocation;",
	note = "Adaptive routing;Deadlock recovery;Distributed deadlock detection;Wormhole networks;",
	number = 8,
	pages = "765--779",
	title = "{FC3D}: {F}low control-based distributed deadlock detection mechanism for true fully adaptive routing in wormhole networks",
	url = "http://dx.doi.org/10.1109/TPDS.2003.1225056",
	volume = 14,
	year = 2003
}

A Bermudez, R Casado, F J Quiles, T M Pinkston and Jose Duato. On the Infiniband subnet discovery process. 2003, 512 - 17. URL BibTeX

@conference{ 7962798,
	author = "Bermudez, A. and Casado, R. and Quiles, F. J. and Pinkston, T. M. and Duato, Jose",
	abstract = "InfiniBand is becoming an industry standard both for communication between processing nodes and I/O devices, and for interprocessor communication. Instead of using a shared bus, InfiniBand employs an arbitrary (possibly irregular) switched point-to-point network. InfiniBand specification defines a basic management infrastructure that is responsible for subnet configuration, activation, and fault tolerance. After the detection of a topology change, management entities collect the current subnet topology. The topology discovery algorithm is one of the management issues that are outside the scope of the current specification. Preliminary implementations obtain the entire topological information each time a change is detected. In this work, we present and analyze an optimized implementation, based on exploring only the region that has been affected by the change.",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings. IEEE International Conference on Cluster Computing",
	doi = "10.1109/CLUSTR.2003.1253361",
	keywords = "communication complexity;computer communications software;computer network management;data communication;fault tolerant computing;local area networks;message passing;network operating systems;optimisation;telecommunication network routing;telecommunicatio",
	note = "Infiniband;subnet discovery process;processing nodes;I/O devices;interprocessor communication;shared bus;arbitrary switched point-to-point network;basic management infrastructure;subnet configuration;subnet activation;fault tolerance;subnet topology;topology discovery algorithm;",
	pages = "512--517",
	title = "On the {Infiniband} subnet discovery process",
	url = "http://dx.doi.org/10.1109/CLUSTR.2003.1253361",
	year = 2003
}

Eun Jung Kim, Ki Hwan Yum, C R Das, M Yousif and Jose Duato. Performance enhancement techniques for InfiniBand™ Architecture. 2003, 253 - 62. URL BibTeX

@conference{ 7703806,
	author = "Kim, Eun Jung and Yum, Ki Hwan and Das, C. R. and Yousif, M. and Duato, Jose",
	abstract = "The InfiniBand^TM Architecture (IBA) is envisioned to be the default communication fabric for future system area networks (SAN). However, the released IBA specification outlines only higher level functionalities, leaving it open for exploring various design alternatives. In this paper we investigate four co-related techniques to provide high and predictable performance in IBA. These are: (i) using the shortest path first (SPF) algorithm for deterministic packet routing; (ii) developing a multipath routing mechanism for minimizing congestion; (iii) developing a selective packet dropping scheme to handle deadlock and congestion; and (iv) providing multicasting support for customized applications. These designs are evaluated using an integrated workload on a versatile IBA simulation testbed. Simulation results indicate that the SPF routing, multipath routing, packet dropping, and multicasting schemes are quite effective in delivering high and assured performance in clusters. One of the major contributions of this research is the IBA simulation testbed, which is an essential tool to evaluate various design tradeoffs.",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings the Ninth International Symposium on High-Performance Computer Architecture. HPCA-9 2003",
	doi = "10.1109/HPCA.2003.1183543",
	keywords = "concurrency control;deterministic algorithms;local area networks;multicast communication;packet switching;performance evaluation;telecommunication congestion control;telecommunication network routing;",
	note = "performance enhancement techniques;InfiniBand Architecture;system area networks;SAN;IBA specification;shortest path first algorithm;SPF algorithm;deterministic packet routing;multipath routing mechanism;congestion minimization;selective packet dropping;deadlock;multicasting support;customized applications;integrated workload;simulation testbed;clusters;design tradeoffs;",
	pages = "253--262",
	title = "Performance enhancement techniques for {InfiniBand\texttrademark} {Architecture}",
	url = "http://dx.doi.org/10.1109/HPCA.2003.1183543",
	year = 2003
}

J C Sancho, Juan Carlos Martinez, Antonio Robles, Pedro Lopez, Jose Flich and Jose Duato. Performance evaluation of COWS under real parallel applications. In Parallel and Distributed Processing Symposium, 2003. Proceedings. International. 2003, 10 pp. DOI BibTeX

@conference{ 1213371,
	author = "Sancho, J. C. and Martinez, Juan Carlos and Robles, Antonio and Lopez, Pedro and Flich, Jose and Duato, Jose",
	abstract = "Clusters of workstations (COWS) are often arranged as a switch-based network with irregular topology. Usually, the evaluation of interconnection networks for COWS has been carried out by simulation using synthetic traffic and by traces from real parallel applications. Although both types of traffics are used as a first approximation of the behavior of the system, a more accurate behavior can be obtained by using real parallel applications. In this paper, a new simulation framework has been developed in order to evaluate interconnection networks under real parallel applications by using an execution-driven simulator. Moreover, the new simulator can be used to evaluate the impact on the performance of the whole system of several design parameters in addition to the interconnection network. Evaluation results show that the execution time of real parallel applications can be reduced by using an effective routing algorithm. Moreover, in some cases, the achieved improvements are higher than the ones achieved by improving other design issues, such as the processor instruction issue rate, the cache size or the network bandwidth.",
	booktitle = "Parallel and Distributed Processing Symposium, 2003. Proceedings. International",
	doi = "10.1109/IPDPS.2003.1213371",
	issn = "1530-2075",
	keywords = "COWS; cache size; clusters of workstations; execution-driven simulator; interconnection networks; network bandwidth; performance evaluation; processor instruction issue rate; simulation framework; switch-based network; discrete event simulation; performa",
	month = apr,
	pages = "10 pp.",
	title = "Performance evaluation of {COWS} under real parallel applications",
	year = 2003
}

F J Alfaro, J L Sanchez, Luis Orozco and Jose Duato. Providing QoS in InfiniBand for Regular and Irregular Topologies. 2003, 1079 - 1082. URL BibTeX

@conference{ 2003407661216,
	author = "Alfaro, F. J. and Sanchez, J. L. and Orozco, Luis and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) is becoming an industry standard for communication between processing nodes and I/O devices or for interprocessor communication. It is being developed by the InfiniBand^SM Trade Association (IBTA) to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) necessary for present and future server systems. In [1] we proposed a new strategy to address these issues. We have evaluated this new strategy only for irregular topology networks [4]. In this paper we evaluate our proposal for regular topologies (hypercube and mesh) and we compare the results obtained. In this way, we want to study the influence of the topology on the QoS mechanisms.",
	address = "Montreal, Canada",
	booktitle = "Canadian Conference on Electrical and Computer Engineering",
	doi = "10.1109/CCECE.2003.1226083",
	issn = "0840-7789",
	key = "Communication systems",
	keywords = "Computer simulation;Quality of service;Telecommunication links;Topology;",
	note = "Interprocessor communication;",
	pages = "1079--1082",
	title = "Providing {QoS} in {InfiniBand} for {Regular} and {Irregular} {Topologies}",
	url = "http://dx.doi.org/10.1109/CCECE.2003.1226083",
	volume = 2,
	year = 2003
}

JC Sancho, Antonio Robles, Pedro Lopez, Jose Flich and Jose Duato. Routing in InfiniBand (TM) torus network topologies. In P Sadayappan and CS Yang (eds.). 2003 INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING, PROCEEDINGS. 2003, 509-518. BibTeX

@conference{ isi:000186828800056,
	author = "Sancho, J. C. and Robles, Antonio and Lopez, Pedro and Flich, Jose and Duato, Jose",
	abstract = "InfiniBand is an interconnect standard for communication between processing nodes and I/O devices as well as for interprocessor communication (NOWs). The InfiniBand Architecture (IBA) defines a switch-based network with point-to-point links whose topology can be established by the customer. When the performance is the primary concern regular topologies are preferred. Low-dimensional tori (2D and 3D) are some of the regular topologies most widely used in commercial parallel computers. Routing in torus requires the use of virtual channels. Although InfiniBand provides support for deterministic routing and virtual channels, they are selected at each switch by service level (SL) identifiers associated to packets and do not depend on packet destination. This makes routing algorithm implementation more complex. In particular, a large number of SLs may be required, which is a scarce resource. In this paper we analyze the way several routing strategies can be applied in tori InfiniBand networks, also evaluating their resource requirements. In particular, we analyze and compare the well-known e-cube and up{*}/down{*} routing algorithms and the Flexible routing algorithm recently proposed.",
	booktitle = "2003 International Conference on Parallel Processing, Proceedings",
	editor = "Sadayappan, P. and Yang, C. S.",
	isbn = "0-7695-2017-0",
	month = oct,
	note = "International Conference on Parallel Processing, Kaohsiung, Taiwan, Oct 06-09, 2003",
	pages = "509--518",
	title = "Routing in {InfiniBand} ({TM}) torus network topologies",
	year = 2003
}

Salvador Coll, Jose Duato, F Petrini and F J Mora. Scalable Hardware-Based Multicast Trees. In Supercomputing, 2003 ACM/IEEE Conference. 2003, 54 - 54. URL, DOI BibTeX

@conference{ 1592957,
	author = "Coll, Salvador and Duato, Jose and Petrini, F. and Mora, F. J.",
	abstract = "This paper presents an algorithm for implementing optimal hardware-based multicast trees, on networks that provide hardware support for collective communication. Although the proposed methodology can be generalized to a wide class of networks, we apply our methodology to the Quadrics network, a state-of-the-art network that provides hardware-based multicast communication. The proposed mechanism is intended to improve the performance of the collective communication patterns on the network, in those cases where the hardware support can not be directly used, for instance, due to some faulty nodes. This scheme provides significant reduction on multicast latencies compared to the original system primitives, which use multicast trees based on unicast communication. A backtracking algorithm to find the optimal solution to the problem is presented. In addition, a greedy algorithm is presented and shown to provide near optimal solutions. Finally, our experimental results show the good performance and scalability of the proposed multicast tree in comparison to the traditional unicast-based multicast trees. Our multicast mechanism doubles barrier synchronization and broadcasts performance when compared to the production-level MPI library.",
	booktitle = "Supercomputing, 2003 ACM/IEEE Conference",
	doi = "10.1109/SC.2003.10058",
	isbn = "1-58113-695-1",
	month = nov,
	pages = "54",
	publisher = "IEEE Computer Society",
	title = "Scalable {Hardware}-{Based} {Multicast} {Trees}",
	url = "http://doi.ieeecomputersociety.org/10.1109/SC.2003.10058",
	year = 2003
}

Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Supporting adaptive routing in IBA switches. 2003, 441 - 456. URL BibTeX

@article{ 2003487758791,
	author = "Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "InfiniBand is a new standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) supports distributed deterministic routing because forwarding tables store a single output port per destination ID. This prevents packets from using alternative paths when the requested output port is busy. Despite the fact that alternative paths could be selected at the source node to reach the same destination node, this is not effective enough to improve network performance. However, using adaptive routing could help to circumvent the congested areas in the network, leading to an increment in performance. In this paper, we propose a simple strategy to implement forwarding tables for IBA switches that supports adaptive routing while still maintaining compatibility with the IBA specs. Adaptive routing can be individually enabled or disabled for each packet at the source node. The proposed strategy enables the use in IBA of any adaptive routing algorithm with an acyclic channel dependence graph. In this paper, we have taken advantage of the partial adaptivity provided by the well-known up*/down* routing algorithm. Evaluation results show that extending IBA switch capabilities with adaptive routing may noticeably increase network performance. In particular, network throughput improvement can be, on average, as high as 66%. © 2003 Elsevier B.V. All rights reserved.",
	doi = "10.1016/S1383-7621(03)00103-6",
	issn = "1383-7621",
	journal = "Journal of Systems Architecture",
	key = "Systems engineering",
	keywords = "Algorithms;Communication;Information technology;Switches;Telecommunication networks;",
	note = "Adaptive routing;",
	number = "10--11",
	pages = "441--456",
	title = "Supporting adaptive routing in {IBA} switches",
	url = "http://dx.doi.org/10.1016/S1383-7621(03)00103-6",
	volume = 49,
	year = 2003
}

Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Supporting adaptive routing in InfiniBand networks. In Parallel, Distributed and Network-Based Processing, 2003. Proceedings. Eleventh Euromicro Conference on. 2003, 165 - 172. URL, DOI BibTeX

@conference{ 1183583,
	author = "Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "InfiniBand is a new standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) supports distributed deterministic routing because forwarding tables store a single output port per destination ID. This prevents packets from using alternative paths when the requested output port is busy. Despite the fact that alternative paths could be selected at the source node to reach the same destination node, this is not effective enough to improve network performance. However using adaptive routing could help to circumvent the congested areas in the network, leading to an increment in performance. In this paper we propose a simple strategy to implement forwarding tables for IBA switches that supports adaptive routing while still maintaining compatibility with the IBA specifications. Adaptive routing can be individually enabled or disabled for each packet at the source node. The proposed strategy enables the use in IBA of any adaptive routing algorithm with an acyclic channel dependence graph. In this paper, we have taken advantage of the partial adaptivity provided by the well-known up*/down* routing algorithm. Evaluation results show that extending IBA switch capabilities with adaptive routing may noticeably increase network performance. In particular network throughput improvement can be, on average, as high as 46%.",
	booktitle = "Parallel, Distributed and Network-Based Processing, 2003. Proceedings. Eleventh Euromicro Conference on",
	doi = "10.1109/EMPDP.2003.1183583",
	issn = "1066-6192",
	keywords = "I-O devices; IBA switches; InfiniBand Architecture; InfiniBand networks; acyclic channel dependence graph; adaptive routing; deterministic routing; forwarding tables; interprocessor communication; network performance; network throughput; processing node",
	month = feb,
	pages = "165--172",
	title = "Supporting adaptive routing in {InfiniBand} networks",
	url = "http://dx.doi.org/10.1109/EMPDP.2003.1183583",
	year = 2003
}

Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Supporting fully adaptive routing in InfiniBand networks. In Parallel and Distributed Processing Symposium, 2003. Proceedings. International. April 2003, 10 pp. URL, DOI BibTeX

@conference{ 1213130,
	author = "Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "InfiniBand is a new standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) supports distributed routing. However, routing in IBA is deterministic because forwarding tables store a single output port per destination ID. This prevents packets from using alternative paths when the requested output port is busy. Despite the fact that alternative paths could be selected at the source node to reach the same destination node, this is not effective enough to improve network performance. However, using adaptive routing could help to circumvent the congested areas in the network, leading to an increment in performance. In this paper, we propose a simple strategy to implement forwarding tables for IBA switches that support adaptive routing while still maintaining compatibility with the IBA specs. Adaptive routing can be enabled or disabled individually for each packet at the source node. Also, the proposed strategy enables the use in IBA of fully adaptive routing algorithms without using additional network resources to improve network performance. Evaluation results show that extending IBA switch capabilities with fully adaptive routing noticeably increases network performance. In particular, network throughput increases up to an average factor of 3.9.",
	booktitle = "Parallel and Distributed Processing Symposium, 2003. Proceedings. International",
	doi = "10.1109/IPDPS.2003.1213130",
	issn = "1530-2075",
	keywords = "InfiniBand networks; distributed routing; fully adaptive routing; interprocessor communication; network performance; network throughput; processing nodes; computer networks; multiprocessor interconnection networks; performance evaluation;",
	month = apr,
	pages = "10 pp.",
	title = "Supporting fully adaptive routing in {InfiniBand} networks",
	url = "http://dx.doi.org/10.1109/IPDPS.2003.1213130",
	year = 2003
}

Juan Carlos Martinez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Supporting fully adaptive routing in InfiniBand networks. 2003, 10 pp. URL BibTeX

@conference{ 7891311,
	author = "Martinez, Juan Carlos and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "InfiniBand is a new standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) supports distributed routing. However, routing in IBA is deterministic because forwarding tables store a single output port per destination ID. This prevents packets from using alternative paths when the requested output port is busy. Despite the fact that alternative paths could be selected at the source node to reach the same destination node, this is not effective enough to improve network performance. However, using adaptive routing could help to circumvent the congested areas in the network, leading to an increment in performance. In this paper, we propose a simple strategy to implement forwarding tables for IBA switches that support adaptive routing while still maintaining compatibility with the IBA specs. Adaptive routing can be enabled or disabled individually for each packet at the source node. Also, the proposed strategy enables the use in IBA of fully adaptive routing algorithms without using additional network resources to improve network performance. Evaluation results show that extending IBA switch capabilities with fully adaptive routing noticeably increases network performance. In particular, network throughput increases up to an average factor of 3.9.",
	address = "Los Alamitos, CA, USA",
	annote = "Duplicate record of entry 1213130 (same DOI); kept so that existing citations of this key still resolve.",
	booktitle = "Proceedings International Parallel and Distributed Processing Symposium",
	doi = "10.1109/IPDPS.2003.1213130",
	keywords = "computer networks;multiprocessor interconnection networks;performance evaluation;",
	note = "fully adaptive routing;InfiniBand networks;processing nodes;interprocessor communication;distributed routing;network performance;network throughput;",
	pages = "10 pp.",
	title = "Supporting fully adaptive routing in {InfiniBand} networks",
	url = "http://dx.doi.org/10.1109/IPDPS.2003.1213130",
	year = 2003
}

Maria E Gomez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. VOQSW: a methodology to reduce HOL blocking in InfiniBand networks. In Parallel and Distributed Processing Symposium, 2003. Proceedings. International. 2003, 10 pp. DOI BibTeX

@conference{ 1213134,
	author = "Gomez, Maria E. and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "InfiniBand is a new switch-based standard interconnect for communication between processor nodes and I/O devices as well as for interprocessor communication. InfiniBand architecture allows switches to support up to 15 virtual lanes per port for data traffic. To route packets through a given virtual lane (VL), packets are labeled with a certain service level (SL) at injection time, and SLtoVL mapping tables are used at each switch to determine the VL to be used. Many previous works in the literature have shown that separate virtual lanes are able to reduce the influence of the well-known head-of-line (HOL) blocking effect on network performance. However, using virtual lanes to form separate virtual networks is not enough to eliminate the HOL blocking problem. Alternative solutions such as Virtual Output Queuing (VOQ) are able to eliminate it at the expense of modifying the switch buffer organization. In this paper, we propose an effective strategy to implement the VOQ scheme in IBA switches by using virtual lanes. This strategy does not require to modify the switch architecture, simply SL to VL tables must be properly filled. Evaluation results show that our proposed VOQ scheme is able to outperform the results obtained with the virtual network approach using the same number of resources. Moreover, the methodology proposed to implement the VOQ scheme in IBA only requires a small number of resources in order to significantly improve network throughput.",
	booktitle = "Parallel and Distributed Processing Symposium, 2003. Proceedings. International",
	doi = "10.1109/IPDPS.2003.1213134",
	keywords = "HOL blocking; InfiniBand networks; SL to VL mapping tables; head-of-line blocking effect; interprocessor communication; network performance; network throughput; switch buffer organization; switch-based standard interconnect; virtual lane; virtual output",
	month = apr,
	pages = "10 pp.",
	title = "{VOQSW}: a methodology to reduce {HOL} blocking in {InfiniBand} networks",
	year = 2003
}

J M Orduna, Federico Silla and Jose Duato. A clustering method for modeling the communication requirements of message-passing applications. Computing and Informatics 21(1):1 - 16, 2002. BibTeX

@article{ 7407405,
	author = "Orduna, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. Usually these systems become heterogeneous as they grow, due to their incremental capabilities. Many research activities have focused on the problem of task scheduling in heterogeneous systems from the computational point of view. However, an ideal scheduling strategy would also take into account the communication requirements of the applications and the communication bandwidth available in the network. One of the key issues in this strategy is the measurement of the communication requirements for each application. We propose a clustering-based method to characterize the communications between processes generated by message-passing applications. This technique provides a model consisting of several partitions of the processes generated by the application. Also, we propose a criterion to measure the quality of the obtained partitions. This approach can be used when a given application is repeatedly executed with different input data. Results show that the proposed method can provide a partition with the highest ratio between the intracluster and the intercluster required communication bandwidth. This partition can be used to map groups of processes to processors in the heterogeneous system.",
	address = "Slovakia",
	issn = "0232-0274",
	journal = "Computing and Informatics",
	keywords = "message passing;performance evaluation;resource allocation;scheduling;workstation clusters;",
	note = "clustering method;communication requirements;message-passing applications;cost-effective;high-performance computing;task scheduling;heterogeneous systems;interconnection networks;cluster computing;communication bandwidth;intracluster;intercluster;",
	number = 1,
	pages = "1--16",
	title = "A clustering method for modeling the communication requirements of message-passing applications",
	volume = 21,
	year = 2002
}

B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. A multimedia router architecture to provide high performance and QoS guarantees to mixed traffic. 2002, 313 - 16. URL BibTeX

@conference{ 7540635,
	author = "Caminero, B. and Carrion, C. and Quiles, F. J. and Duato, Jose and Yalamanchili, S.",
	abstract = "The explosive growth in using scalable and cost-effective clusters and local area environments involve the design of high performance networks aimed at providing QoS to multimedia flows. Thus, the main goal pursued by the Multi-Media (MMR) project is to design a single-chip router able to efficiently handle multimedia flows and best-effort traffic. In this paper we focus on the performance evaluation of the MMR architecture using a mix of CBR, VBR and best effort workload. Preliminary simulation results show that, by using simple link and switch scheduling algorithms, the router is able to achieve a link bandwidth utilization of 80%, while still providing QoS guarantees to both CBR and VBR traffic in the presence of best-effort traffic.",
	address = "Piscataway, NJ, USA",
	booktitle = "Proceedings 2002 IEEE International Conference on Multimedia and Expo (Cat. No.02TH8604)",
	doi = "10.1109/ICME.2002.1035781",
	keywords = "bandwidth allocation;multimedia communication;quality of service;telecommunication network routing;telecommunication traffic;",
	note = "multimedia router architecture;QoS guarantees;Multi-Media project;MMR project;single-chip router;best-effort traffic;performance evaluation;CBR;VBR;link bandwidth utilization;mixed traffic;multimedia flows;high performance networks;",
	pages = "313--316",
	title = "A multimedia router architecture to provide high performance and {QoS} guarantees to mixed traffic",
	url = "http://dx.doi.org/10.1109/ICME.2002.1035781",
	volume = 1,
	year = 2002
}

B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. A new switch scheduling algorithm to improve QoS in the multimedia router. 2002, 376 - 9. URL BibTeX

@conference{ 7810072,
	author = "Caminero, B. and Carrion, C. and Quiles, F. J. and Duato, Jose and Yalamanchili, S.",
	abstract = "The multimedia router (MMR) is aimed at providing QoS to multimedia flows, which coexist with conventional best-effort traffic, by means of a single-chip, compact router designed for cluster and local area environments. As the router is based on a multiplexed crossbar, hardware efficient link and switch scheduling algorithms are needed. Their goal is to achieve a high utilization, while the QoS needed by the multimedia connections is guaranteed. This work presents a novel switch scheduling algorithm, the candidate conflict arbiter (CCA), that can be efficiently implemented in the MMR. Simulation results show that this proposal beats other previous algorithms in terms of maximum throughput achieved while still providing QoS to the multimedia flows.",
	address = "Piscataway, NJ, USA",
	booktitle = "Proceedings of 2002 IEEE Workshop on Multimedia Signal Processing (Cat. No.02TH8661)",
	doi = "10.1109/MMSP.2002.1203324",
	keywords = "local area networks;multimedia communication;quality of service;scheduling;telecommunication network routing;telecommunication switching;",
	note = "switch scheduling algorithm;QoS;multimedia router;quality of service;best-effort traffic;LAN;single-chip;multiplexed crossbar;hardware efficient link;candidate conflict arbiter;multimedia flow;cluster;local area environment;",
	pages = "376--379",
	title = "A new switch scheduling algorithm to improve {QoS} in the multimedia router",
	url = "http://dx.doi.org/10.1109/MMSP.2002.1203324",
	year = 2002
}

F J Alfaro, J L Sanchez, Jose Duato and C R Das. A strategy to compute the InfiniBand arbitration tables. 2002, 43 - 8. URL BibTeX

@conference{ 7342290,
	author = "Alfaro, F. J. and Sanchez, J. L. and Duato, Jose and Das, C. R.",
	abstract = "The InfiniBand Architecture (IBA) is a new industry standard architecture for server I/O and interprocessor communication. InfiniBand is very likely to become the de facto standard in a few years. It is being developed by the InfiniBand Trade Association (IBTA) to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) necessary for present and future server systems. The provision of QoS in data communication networks is currently the focus of much discussion and research in industry and academia. IBA enables QoS support with some mechanisms. In this paper, we examine these mechanisms and describe a way to use them. We propose a traffic segregation strategy based on mean bandwidth requirements. Moreover, we propose a very effective strategy to compute the virtual lane arbitration tables for IBA switches. We evaluate our proposal with different network topologies. Performance results show that, with a correct treatment of each traffic class in the arbitration of the output port, every traffic class meets its QoS requirements.",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 16th International Parallel and Distributed Processing Symposium",
	doi = "10.1109/IPDPS.2002.1015474",
	keywords = "quality of service;standards;system buses;",
	note = "InfiniBand arbitration tables;industry standard architecture;server I/O;interprocessor communication;reliability;availability;performance;scalability;quality of service;data communication networks;traffic segregation strategy;mean bandwidth requirements;virtual lane arbitration tables;QoS requirements;",
	pages = "43--48",
	title = "A strategy to compute the {InfiniBand} arbitration tables",
	url = "http://dx.doi.org/10.1109/IPDPS.2002.1015474",
	year = 2002
}

M E Acacio, J Gonzalez, J M Garcia and Jose Duato. A novel approach to reduce L2 miss latency in shared-memory multiprocessors. 2002, 580 - 7. URL BibTeX

@conference{ 7342351,
	author = "Acacio, M. E. and Gonzalez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "Recent technology improvements allow multiprocessor designers to put some key components inside the processor chip, such as the memory controller, the coherence hardware and the network interface/router. In this work we exploit such integration scale, presenting a novel node architecture aimed at reducing the long L2 miss latencies and the memory overhead of using directories that characterize cc-NUMA machines and limit their scalability. Our proposal replaces the traditional directory with a novel three-level directory architecture and adds a small shared data cache to each of the nodes of a multiprocessor system. Due to their small size, the first-level directory and the shared data cache are integrated into the processor chip in every node. A taxonomy of the L2 misses, according to the actions performed by the directory to satisfy them is also presented. Using execution-driven simulations, we show significant L2 miss latency reductions (more than 60% in some cases). These important improvements translate into reductions of more than 30% in the application execution time in some cases",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 16th International Parallel and Distributed Processing Symposium",
	doi = "10.1109/IPDPS.2002.1015554",
	keywords = "cache storage;parallel architectures;performance evaluation;shared memory systems;",
	note = "L2 miss latency reduction;shared-memory multiprocessors;memory controller;coherence hardware;network interface;node architecture;memory overhead;cc-NUMA machines;shared data cache;execution-driven simulations;scalability;three-level directory architecture;",
	pages = "580--587",
	title = "A novel approach to reduce {L2} miss latency in shared-memory multiprocessors",
	url = "http://dx.doi.org/10.1109/IPDPS.2002.1015554",
	year = 2002
}

I Paul, S Yalamanchili and Jose Duato. Algorithms for switch-scheduling in the multimedia router for LANs. 2002, 219 - 231. BibTeX

@conference{ 7748982,
	author = "Paul, I. and Yalamanchili, S. and Duato, Jose",
	abstract = "The primary objective of the multimedia router (MMR) [Jose Duato et al., (1999)] project is to design and implement a single chip router targeted for use in cluster and LAN interconnection networks. The goal can be concisely captured in the phrase 'QoS routing at link speeds'. We study a set of algorithms for switch-scheduling based on a highly concurrent implementation for capturing output port requests. Two different switch-scheduling algorithms called row-column ordering and diagonal ordering are proposed and implemented in a switch-scheduling framework which involves a matrix data structure, and therefore enables concurrent and parallel operations at high-speed. Their performance has been evaluated with constant bit rate (CBR), variable bit rate (VBR), and a mixture of CBR and VBR traffic. At high offered loads both these ordering functions have been shown to deliver superior quality of service (QoS) to connections at a high scheduling rate and high utilization",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing - HiPC 2002. 9th International Conference. Proceedings",
	keywords = "concurrency control;LAN interconnection;multimedia communication;packet switching;parallel processing;quality of service;telecommunication network routing;telecommunication traffic;",
	note = "multimedia router;MMR;LAN interconnection network;QoS routing;quality of service;output port request;switch-scheduling algorithm;row-column ordering;diagonal ordering;matrix data structure;concurrent operation;parallel operation;constant bit rate;variable bit rate;",
	pages = "219--231",
	series = "Lecture Notes in Computer Science",
	title = "Algorithms for switch-scheduling in the multimedia router for {LANs}",
	volume = 2552,
	year = 2002
}

J C Sancho, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Analyzing the influence of virtual lanes on the performance of InfiniBand networks. In Parallel and Distributed Processing Symposium., Proceedings International, IPDPS 2002, Abstracts and CD-ROM. 2002, 166 - 175. BibTeX

@conference{ 1016568,
	author = "Sancho, J. C. and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	booktitle = "Parallel and Distributed Processing Symposium., Proceedings International, IPDPS 2002, Abstracts and CD-ROM",
	pages = "166--175",
	title = "Analyzing the influence of virtual lanes on the performance of {InfiniBand} networks",
	year = 2002
}

Elvira Baydal, Pedro Lopez and Jose Duato. Avoiding network congestion with local information. 2002, 35 - 48. URL BibTeX

@conference{ 20093412265277,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Congestion leads to a severe performance degradation in multiprocessor interconnection networks. Therefore, the use of techniques that prevent network saturation are of crucial importance. Some recent proposals use global network information, thus requiring that nodes exchange some control information, which consumes a far from negligible bandwidth. As a consequence, the behavior of these techniques in practice is not as good as expected. In this paper, we propose a mechanism that uses only local information to avoid network saturation. Each node estimates traffic locally by using the percentage of free virtual output channels that can be used to forward a message towards its destination. When this number is below a threshold value, network congestion is assumed to exist and message throttling is applied. The main contributions of the proposed mechanism are two: i) it is more selective than previous approaches, as it only prevents the injection of messages when they are destined to congested areas; and ii) it outperforms recent proposals that rely on global information. © 2002 Springer Berlin Heidelberg.",
	address = "Kansai Science City, Japan",
	booktitle = "High Performance Computing. 4th International Symposium, ISHPC 2002. Proceedings",
	doi = "10.1007/3-540-47847-7_6",
	issn = "0302-9743",
	key = "Interconnection networks",
	keywords = "Computer science;Telecommunication networks;",
	note = "Control information;Global informations;Global network information;Local information;Message throttling;Multiprocessor interconnections;Network congestions;Network saturation;Performance degradation;Virtual output;",
	pages = "35--48",
	series = "Lecture Notes in Computer Science",
	title = "Avoiding network congestion with local information",
	url = "http://dx.doi.org/10.1007/3-540-47847-7_6",
	volume = 2327,
	year = 2002
}

Jose Flich, Pedro Lopez, M P Malumbres and Jose Duato. Boosting the performance of Myrinet networks. Parallel and Distributed Systems, IEEE Transactions on 13(11):1166 - 1182, November 2002. URL, DOI BibTeX

@article{ 1058099,
	author = "Flich, Jose and Lopez, Pedro and Malumbres, M. P. and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. These networks allow the customer to connect processors using irregular topologies, providing the wiring flexibility, scalability, and incremental expansion capability required in this environment. Some of these networks use source routing and wormhole switching. In particular, we are interested in Myrinet networks because it is a well-known commercial product and its behavior can be controlled by the software running in network interfaces (Myrinet Control Program, MCP). Usually, the Myrinet network uses up*/down* routing for computing the paths for every source-destination pair. We propose the In-Transit Buffer (ITB) mechanism to improve network performance. We apply the ITB mechanism to NOWs with up*/down* source routing, like Myrinet, analyzing its behavior on both networks with regular and irregular topologies. The proposed scheme can be implemented on Myrinet networks by only modifying the MCP, without changing the network hardware. We evaluate by simulation several networks with different traffic patterns using timing parameters taken from the Myrinet network. Results show that the current routing schemes used in Myrinet networks can be strongly improved by applying the ITB mechanism. In general, our proposed scheme is able to double the network throughput on medium and large NOWs. Finally, we present a first implementation of the ITB mechanism on a Myrinet network.",
	doi = "10.1109/TPDS.2002.1058099",
	issn = "1045-9219",
	journal = "Parallel and Distributed Systems, IEEE Transactions on",
	keywords = "In-Transit Buffer; Myrinet network; irregular topologies; network interfaces; network performance boosting; network traffic; parallel computers; performance evaluation; scalability; simulation; throughput; up down source routing; workstation networks",
	month = nov,
	number = 11,
	pages = "1166--1182",
	title = "Boosting the performance of {Myrinet} networks",
	url = "http://dx.doi.org/10.1109/TPDS.2002.1058099",
	volume = 13,
	year = 2002
}

Jose Flich, Pedro Lopez, M P Malumbres and Jose Duato. Boosting the performance of Myrinet networks. Parallel and Distributed Systems, IEEE Transactions on 13(7):693 -709, July 2002. URL, DOI BibTeX

@article{ 1019859,
	author = "Flich, Jose and Lopez, Pedro and Malumbres, M. P. and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. These networks allow the customer to connect processors using irregular topologies, providing the wiring flexibility, scalability and incremental expansion capability required in this environment. Some of these networks use source routing and wormhole switching. In particular, we are interested in Myrinet networks because they are a well-known commercial product and their behavior can be controlled by the software running on the network interfaces (the Myrinet Control Program, MCP). Usually, the Myrinet network uses up*/down* routing for computing the paths for every source-destination pair. In this paper, we propose an in-transit buffer (ITB) mechanism to improve the network performance. We apply the ITB mechanism to NOWs with up*/down* source routing, like the Myrinet, analyzing its behavior on networks with both regular and irregular topologies. The proposed scheme can be implemented on Myrinet networks by simply modifying the MCP, without changing the network hardware. We evaluate by simulation several networks with different traffic patterns using timing parameters taken from the Myrinet network. The results show that the current routing schemes used in Myrinet networks can be strongly improved by applying the ITB mechanism. In general, our proposed scheme is able to double the network throughput on medium and large NOWs. Finally, we present a first implementation of the ITB mechanism on a Myrinet network",
	doi = "10.1109/TPDS.2002.1019859",
	issn = "1045-9219",
	journal = "Parallel and Distributed Systems, IEEE Transactions on",
	keywords = "Myrinet Control Program;Myrinet network performance;in-transit buffer mechanism;incremental expansion capability;irregular topologies;minimal routing;network interfaces;network throughput;network traffic patterns;performance evaluation;regular topologies;",
	month = jul,
	number = 7,
	pages = "693--709",
	title = "Boosting the performance of {Myrinet} networks",
	url = "http://dx.doi.org/10.1109/TPDS.2002.1019859",
	volume = 13,
	year = 2002
}

Jose Flich, Pedro Lopez, M Perez Malumbres and Jose Duato. Boosting the performance of Myrinet networks. IEEE Transactions on Parallel and Distributed Systems 13(7):693 - 709, 2002. URL BibTeX

@article{ 2002367073594,
	author = "Flich, Jose and Lopez, Pedro and Malumbres, M. Perez and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. These networks allow the customer to connect processors using irregular topologies, providing the wiring flexibility, scalability, and incremental expansion capability required in this environment. Some of these networks use source routing and wormhole switching. In particular, we are interested in Myrinet networks because it is a well-known commercial product and its behavior can be controlled by the software running in network interfaces (Myrinet Control Program, MCP). Usually, the Myrinet network uses up*/down* routing for computing the paths for every source-destination pair. In this paper, we propose the In-Transit Buffer (ITB) mechanism to improve network performance. We apply the ITB mechanism to NOWs with up*/down* source routing, like Myrinet, analyzing its behavior on both networks with regular and irregular topologies. The proposed scheme can be implemented on Myrinet networks by only modifying the MCP, without changing the network hardware. We evaluate by simulation several networks with different traffic patterns using timing parameters taken from the Myrinet network. Results show that the current routing schemes used in Myrinet networks can be strongly improved by applying the ITB mechanism. In general, our proposed scheme is able to double the network throughput on medium and large NOWs. Finally, we present a first implementation of the ITB mechanism on a Myrinet network.",
	doi = "10.1109/TPDS.2002.1019859",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Computer networks",
	keywords = "Buffer storage;Computer hardware;Computer simulation;Computer workstations;Interfaces;Parallel processing systems;Program processors;Routers;Telecommunication traffic;Topology;",
	note = "Myrinet networks;",
	number = 7,
	pages = "693--709",
	title = "Boosting the performance of {Myrinet} networks",
	url = "http://dx.doi.org/10.1109/TPDS.2002.1019859",
	volume = 13,
	year = 2002
}

Salvador Petit, Julio Sahuquillo and A Pont. Characterizing parallel workloads to reduce multiple writer overhead in shared virtual memory systems. In Parallel, Distributed and Network-based Processing, 2002. Proceedings. 10th Euromicro Workshop on. 2002, 261 -268. URL, DOI BibTeX

@conference{ 994285,
	author = "Petit, Salvador and Sahuquillo, Julio and Pont, A.",
	abstract = "Shared virtual memory (SVM) systems, because of their software implementation, enable shared-memory programming at a low design and maintenance cost. Nevertheless, as hardware implementations become faster, their performance is still far from that achieved by distributed shared memory (DSM) systems. Nowadays, SVM systems use relaxed memory consistency models and multiple writer protocols as techniques to reduce latencies and false sharing, respectively. However, these techniques induce additional overhead that decreases system performance. We performed a study of workload behavior aimed at improving the design of SVM protocols. The work focused on the identification of the type of shared data patterns that can appear in the accesses to protected sections using semaphores. Most coherence actions in SVM systems are performed as a consequence of the write operations executed in critical sections, so we pay special attention to the write operations performed when multiple writers are allowed. As these write operations may present spatial locality, we also study the write patterns on shared pages with similar behaviour. Different software filters are applied in the instrumented parallel workloads selected to capture and classify the most common sharing patterns. This enables the recognition of those patterns in which coherence overhead can be reduced by modifying the coherence actions performed by the protocol. Despite the fact that the performance evaluation of new coherence solutions is not our main goal, the ideas presented to improve the behaviour of SVM systems can be implemented at a reasonable hardware/software cost",
	booktitle = "Parallel, Distributed and Network-based Processing, 2002. Proceedings. 10th Euromicro Workshop on",
	doi = "10.1109/EMPDP.2002.994285",
	isbn = "0-7695-1444-8",
	keywords = "coherence actions;coherence overhead;critical sections;design cost;false sharing reduction;hardware cost;hardware implementations;instrumented parallel workloads;latency reduction;maintenance cost;memory consistency protocols;multiple writer protocols;",
	pages = "261--268",
	title = "Characterizing parallel workloads to reduce multiple writer overhead in shared virtual memory systems",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994285",
	year = 2002
}

Elvira Baydal, Pedro Lopez and Jose Duato. Congestion control based on transmission times. 2002, 781 - 790. BibTeX

@conference{ 7568238,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Congestion leads to a severe performance degradation in multiprocessor interconnection networks. Therefore, the use of techniques that prevent network saturation are of crucial importance to avoid high execution times. We propose a new mechanism that uses only local information to avoid network saturation in wormhole networks. In order to detect congestion, each network node computes the quotient between the real transmission time of messages and its minimum theoretical value. If this ratio is greater than a threshold, the physical channel used by the message is considered congested. Depending on the number of congested channels, the available bandwidth to inject messages is reduced. The main contributions of the new mechanism are three: i) it can detect congestion in a remote way, but without transmitting control information through the network; ii) it tries to dynamically adjust the effective injection bandwidth available at each node; and iii) it is starvation-free. Evaluation results show that the proposed mechanism avoids network performance degradation for different network loads and topologies. Indeed, the mechanism does not introduce any penalty for low and medium network loads, where no congestion control mechanism is required",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2002 Parallel Processing. 8th International Euro-Par Conference. Proceedings",
	keywords = "multiprocessor interconnection networks;network routing;parallel architectures;parallel machines;performance evaluation;",
	note = "congestion control;transmission times;performance degradation;multiprocessor interconnection networks;network saturation;execution times;massively parallel computers;wormhole networks;bandwidth;starvation-free;network topologies;",
	pages = "781--790",
	series = "Lecture Notes in Computer Science",
	title = "Congestion control based on transmission times",
	volume = 2400,
	year = 2002
}

J C Sancho, Antonio Robles, Jose Flich, Pedro Lopez and Jose Duato. Effective methodology for deadlock-free minimal routing in InfiniBand networks. In Parallel Processing, 2002. Proceedings. International Conference on. 2002, 409 - 418. DOI BibTeX

@conference{ 1040897,
	author = "Sancho, J. C. and Robles, Antonio and Flich, Jose and Lopez, Pedro and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) defines a switch-based network with point-to-point links whose topology is arbitrarily established by the customer. We propose a simple and effective methodology for designing deadlock-free routing strategies that are able to route packets through minimal paths in InfiniBand networks. This methodology can meet the trade-off between network performance and the number of resources dedicated to deadlock avoidance. Evaluation results show that the resulting routing strategies significantly outperform up*/down* routing. In particular, throughput improvement ranges, on average, from 1.33 for small networks to 4.05 for large networks. Also, it is shown that just two virtual lanes and three service levels are enough to achieve more than 80% of the throughput improvement achieved by the best proposed routing strategy (the one that always provides minimal paths without limiting the number of resources).",
	booktitle = "Parallel Processing, 2002. Proceedings. International Conference on",
	doi = "10.1109/ICPP.2002.1040897",
	issn = "0190-3918",
	keywords = "InfiniBand architecture; InfiniBand networks; NOWs; deadlock-free minimal routing; interconnection pattern; minimal paths; network performance; packet routing; point-to-point links; service levels; switch-based network; throughput improvement; up*/down*",
	pages = "409--418",
	title = "Effective methodology for deadlock-free minimal routing in {InfiniBand} networks",
	year = 2002
}

P J Garcia, M D Mora, F J Alfaro, J L Sanchez and Jose Flich. Evaluation of alternative arbitration policies for Myrinet switches. In Parallel and Distributed Processing Symposium., Proceedings International, IPDPS 2002, Abstracts and CD-ROM. 2002, 162 - 169. BibTeX

@conference{ 1016560,
	author = "Garcia, P. J. and Mora, M. D. and Alfaro, F. J. and Sanchez, J. L. and Flich, Jose",
	booktitle = "Parallel and Distributed Processing Symposium., Proceedings International, IPDPS 2002, Abstracts and CD-ROM",
	pages = "162--169",
	title = "Evaluation of alternative arbitration policies for {Myrinet} switches",
	year = 2002
}

Maria E Gomez, Jose Flich, Antonio Robles, Pedro Lopez and Jose Duato. Evaluation of routing algorithms for InfiniBand networks. 2002, 775 - 780. BibTeX

@conference{ 7568237,
	author = "Gomez, Maria E. and Flich, Jose and Robles, Antonio and Lopez, Pedro and Duato, Jose",
	abstract = "Storage area networks (SAN) provide the scalability required by the IT servers. The InfiniBand (IBA) interconnect is very likely to become the de facto standard for SAN as well as for NOW. The routing algorithm is a key design issue in irregular networks. Moreover, as several virtual lanes can be used and different network issues can be considered, the performance of the routing algorithms may be affected. In this paper we evaluate three existing routing algorithms (up*/down*, DFS, and smart-routing) suitable for being applied to IBA. Evaluation has been performed by simulation under different synthetic traffic patterns and I/O traces. Simulation results show that the smart-routing algorithm achieves the highest performance",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2002 Parallel Processing. 8th International Euro-Par Conference. Proceedings",
	keywords = "parallel algorithms;performance evaluation;telecommunication network routing;telecommunication standards;telecommunication traffic;workstation clusters;",
	note = "routing algorithms;InfiniBand networks;storage area networks;SAN;scalability;de facto standard;IBA interconnect;NOW;irregular networks;virtual lanes;performance;up*/down* routing;DFS routing;smart routing;synthetic traffic patterns;I/O traces;simulation;IT servers;",
	pages = "775--780",
	series = "Lecture Notes in Computer Science",
	title = "Evaluation of routing algorithms for {InfiniBand} networks",
	volume = 2400,
	year = 2002
}

Jose Flich, Pedro Lopez, J C Sancho, Antonio Robles and Jose Duato. Improving InfiniBand routing through multiple virtual networks. 2002, 49 - 63. BibTeX

@conference{ 7387421,
	author = "Flich, Jose and Lopez, Pedro and Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "InfiniBand is very likely to become the de facto standard for communication between nodes and I/O devices as well as for interprocessor communication. Often, the interconnection pattern is irregular. Up*/down* is the most popular routing scheme currently used in NOWs with irregular topologies. However, the main drawbacks of up*/down* routing are the unbalanced channel utilization and the difficulties to route most packets through minimal paths, which negatively affects network performance. Using additional virtual lanes can improve up*/down* routing performance by reducing the head-of-line blocking effect, but its use is not aimed to remove its main drawbacks. We propose a methodology that uses a reduced number of virtual lanes in an efficient way to achieve a better traffic balance and a higher number of minimal paths. This methodology is based on routing packets simultaneously through several properly selected up*/down* trees. To guarantee deadlock freedom, each up*/down* tree is built over a different virtual network. Simulation results, show that the proposed methodology increases throughput up to an average factor ranging from 1.18 to 2.18 for 8, 16, and 32-switch networks by using only two virtual lanes. For larger networks with an additional virtual lane, network throughput is tripled, on average",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing. 4th International Symposium, ISHPC 2002. Proceedings",
	keywords = "multiplexing;multiprocessor interconnection networks;telecommunication network routing;workstation clusters;",
	note = "InfiniBand routing;networks of workstations;multiple virtual networks;interprocessor communication;NOWs;switch-based network;point-to-point links;up*/down* routing;head-of-line blocking effect;deadlock freedom;",
	pages = "49--63",
	series = "Lecture Notes in Computer Science",
	title = "Improving {InfiniBand} routing through multiple virtual networks",
	volume = 2327,
	year = 2002
}

J Fernandez, J M Garcia and Jose Duato. Improving the performance of real-time communication services on high-speed LANs under topology changes. 2002, 385 - 394. URL BibTeX

@conference{ 7670833,
	author = "Fernandez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "In this paper, we propose and evaluate a new protocol that provides topology change- and fault-tolerant real-time communication services on NOW and clusters. This protocol overcomes the main drawback of our previously proposed protocol, called Dynamically Re-established Real-Time Channels (DRRTC), which is physically limited by the number of virtual channels per port. The new protocol allows different real-time channels to share the same virtual channel. In this way, the new protocol allows us to establish a greater number of real-time channels than the previous one. Moreover, its only limitation is the bandwidth devoted to real-time traffic. However, this introduces two new problems that are successfully managed by the new protocol: the existence of cyclic dependencies among different real-time channels and the increased complexity of deadline requirements. We present and analyze the performance evaluation results when a single switch or a single link is deactivated/activated for different topologies and workloads. The new protocol overwhelms the DRRTC protocol while guaranteeing deadline requirements and channel recovery",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings LCN 2002. 27th Annual IEEE Conference on Local Computer Networks",
	doi = "10.1109/LCN.2002.1181810",
	keywords = "fault tolerance;network topology;performance evaluation;protocols;quality of service;workstation clusters;",
	note = "real-time communication services;high-speed LAN;topology changes;protocol;fault-tolerant real-time communication;NOW;clusters;Dynamically Re-established Real-Time Channels;DRRTC;virtual channel;cyclic dependencies;performance evaluation;deadline requirements;channel recovery;",
	pages = "385--394",
	title = "Improving the performance of real-time communication services on high-speed {LANs} under topology changes",
	url = "http://dx.doi.org/10.1109/LCN.2002.1181810",
	year = 2002
}

Elvira Baydal, Pedro Lopez and Jose Duato. Increasing the adaptivity of routing algorithms for k-ary n-cubes. In Parallel, Distributed and Network-based Processing, 2002. Proceedings. 10th Euromicro Workshop on. 2002, 455 -462. URL, DOI BibTeX

@conference{ 994333,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "In this paper, we show that routing algorithms may exploit not only the flexibility obtained by crossing network dimensions in any order but also that obtained in the same network dimension, thanks to the availability of bidirectional channels. We analyze the behavior of adaptive routing algorithms both for deadlock avoidance and recovery, exploiting this increased routing flexibility, and compare them with previous proposals in order to evaluate the contribution of the additional routing freedom on network performance. Simulation results show that this simple improvement in the routing algorithm allows one to achieve throughput improvements of up to 45% in networks with low radix, for a uniform distribution of message destinations",
	booktitle = "Parallel, Distributed and Network-based Processing, 2002. Proceedings. 10th Euromicro Workshop on",
	doi = "10.1109/EMPDP.2002.994333",
	isbn = "0-7695-1444-8",
	keywords = "adaptive routing algorithms;additional routing freedom;algorithm adaptivity;bidirectional channels;deadlock avoidance;deadlock recovery;hypercube networks;k-ary n-cubes;network dimension crossing;network performance;network radix;routing flexibility;",
	pages = "455--462",
	title = "Increasing the adaptivity of routing algorithms for k-ary n-cubes",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994333",
	year = 2002
}

Elvira Baydal, Pedro Lopez and Jose Duato. Increasing the adaptivity of routing algorithms for k-ary n-cubes. 2002, 455 - 462. URL BibTeX

@conference{ 7205121,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "In this paper, we show that routing algorithms may exploit not only the flexibility obtained by crossing network dimensions in any order but also that obtained in the same network dimension, thanks to the availability of bidirectional channels. We analyze the behavior of adaptive routing algorithms both for deadlock avoidance and recovery, exploiting this increased routing flexibility, and compare them with previous proposals in order to evaluate the contribution of the additional routing freedom on network performance. Simulation results show that this simple improvement in the routing algorithm allows one to achieve throughput improvements of up to 45% in networks with low radix, for a uniform distribution of message destinations",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 10th Euromicro Workshop on Parallel, Distributed and Network-based Processing",
	doi = "10.1109/EMPDP.2002.994333",
	keywords = "adaptive systems;concurrency control;hypercube networks;network routing;parallel algorithms;performance evaluation;system recovery;",
	note = "adaptive routing algorithms;algorithm adaptivity;k-ary n-cubes;hypercube networks;wormhole switching;routing flexibility;network dimension crossing;bidirectional channels;deadlock avoidance;deadlock recovery;additional routing freedom;network performance;simulation;throughput;network radix;uniform message destination distribution;",
	pages = "455--462",
	title = "Increasing the adaptivity of routing algorithms for k-ary n-cubes",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994333",
	year = 2002
}

Ki Hwan Yum, Eun Jung Kim, C R Das, M Yousif and Jose Duato. Integrated admission and congestion control for QoS support in clusters. 2002, 325 - 332. URL BibTeX

@conference{ 7503603,
	author = "Yum, Ki Hwan and Kim, Eun Jung and Das, C. R. and Yousif, M. and Duato, Jose",
	abstract = "Admission and congestion control mechanisms are integral parts of any Quality of Service (QoS) design for networks that support integrated traffic. In this paper we propose an admission control algorithm and a congestion control algorithm for clusters, which are increasingly being used in a diverse set of applications that require QoS guarantees. The uniqueness of our approach is that we develop these algorithms for wormhole-switched networks. We use QoS-capable wormhole routers and QoS-capable network interface cards (NICs), referred to as Host Channel Adapters (HCAs) in InfiniBand{\texttrademark} Architecture (IBA), to evaluate the effectiveness of these algorithms. The admission control is applied at the HCAs and the routers, while the congestion control is deployed only at the HCAs. Simulation results indicate that the admission and congestion control algorithms are quite effective in delivering the assured performance. The proposed credit-based congestion control algorithm is simple and practical in that it relies on hardware already available in the HCA to regulate traffic injection",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 2002 IEEE International Conference on Cluster Computing",
	doi = "10.1109/CLUSTR.2002.1137761",
	keywords = "quality of service;telecommunication congestion control;telecommunication network routing;workstation clusters;admission control;congestion control;Quality of Service;integrated traffic;clusters;wormhole-switched networks;network interface cards;Host Channel Adapters;",
	pages = "325--332",
	title = "Integrated admission and congestion control for {QoS} support in clusters",
	url = "http://dx.doi.org/10.1109/CLUSTR.2002.1137761",
	year = 2002
}

G Bernabe, J Gonzalez, J M Garcia and Jose Duato. Memory conscious 3D wavelet transform. 2002, 108 - 13. URL BibTeX

@conference{ 7480885,
	author = "Bernabe, G. and Gonzalez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "The video compression algorithms based on the 3D wavelet transform obtain excellent compression rates at the expense of huge memory requirements, which drastically affect the execution time of such applications. The goal of this work is to mitigate the memory problem by exploiting the memory hierarchy of the processor through blocking. In particular, we present two blocking approaches: cube and rectangular that differ in the way that the original working set is divided. We also propose the reuse of previous computations in order to decrease the number of memory accesses and floating point operations. Results show that the rectangular overlapped approach with computation reuse obtains the best results in terms of execution time, a speedup of 2.42 over the non-blocking non-overlapped wavelet transform, maintaining the compression ratio and the video quality (PSNR) of the original encoder based on the 3D wavelet transform",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 28th Euromicro Conference",
	doi = "10.1109/EURMIC.2002.1046141",
	keywords = "data compression;medical image processing;performance evaluation;storage management;transform coding;video coding;wavelet transforms;3D wavelet transform;processor memory hierarchy;cube blocking;rectangular blocking;previous computation reuse;floating point operations;rectangular overlapped approach;execution time;speedup;compression ratio;medical video;video quality;PSNR;video compression algorithms;",
	pages = "108--113",
	title = "Memory conscious {3D} wavelet transform",
	url = "http://dx.doi.org/10.1109/EURMIC.2002.1046141",
	year = 2002
}

F J Alfaro, J L Sanchez, Luis Orozco and Jose Duato. Performance evaluation of VBR traffic in InfiniBand. 2002, 1532 - 1537. URL BibTeX

@conference{ 2002317038403,
	author = "Alfaro, F. J. and Sanchez, J. L. and Orozco, Luis and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) is becoming an industry standard both for communication between processing nodes and I/O devices and for interprocessor communication. It replaces the traditional I/O bus with a switch-based interconnect for connecting processing nodes and I/O devices. It is being developed by the InfiniBand\textsuperscript{SM} Trade Association (IBTA) to provide the levels of reliability, availability, performance, scalability, and quality of service (QoS) necessary for present and future server systems. For this, IBA provides a series of mechanisms that are able to guarantee QoS to the applications. In [2, 4], we proposed a strategy to compute the InfiniBand arbitration tables. We only evaluated our proposal for CBR traffic with fixed mean bandwidth requirements. In this paper, we evaluate our strategy to compute the InfiniBand arbitration tables with VBR traffic. Performance results show that, this class of traffic also gets their QoS requirements.",
	address = "Winnipeg, Manitoba, Canada",
	booktitle = "Canadian Conference on Electrical and Computer Engineering",
	doi = "10.1109/CCECE.2002.1012981",
	issn = "0840-7789",
	keywords = "Telecommunication networks;Bandwidth;Packet switching;Quality of service;Servers;Telecommunication traffic;InfiniBand Architecture (IBA);",
	pages = "1532--1537",
	title = "Performance evaluation of {VBR} traffic in {InfiniBand}",
	url = "http://dx.doi.org/10.1109/CCECE.2002.1012981",
	volume = 3,
	year = 2002
}

JC Sancho, Antonio Robles and Jose Duato. Performance sensitivity of routing algorithms to failures in networks of workstations with regular and irregular topologies. In F Vajda and N Podhorszki (eds.). 10TH EUROMICRO WORKSHOP ON PARALLEL, DISTRIBUTED AND NETWORK-BASED PROCESSING, PROCEEDINGS. 2002, 81-90. BibTeX

@conference{ isi:000173566600010,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) provide a cost-effective alternative to parallel computers. Components in NOWs may fail, degrading the network operation until the faults are repaired. In this paper, we analyze the influence of both switch and link failures on the network performance. In particular, given that network performance in NOWs strongly depends on the applied routing algorithm, we quantify the sensitivity to failures of two routing algorithms: flexible routing and up{*}/down{*} routing algorithms. In the case of up{*}/down{*} routing, two methodologies to compute routing tables are evaluated. Evaluation results modeling a Myrinet network show that, in general, up{*}/down{*} routing is more robust to failures, although its behavior strongly depends on the type of network topology, regular or irregular, and the methodology used to compute routing tables. However, the flexible routing algorithm presents a better performance, regardless of the network topology, even in presence of failures, but at expense of a larger sensitivity.",
	booktitle = "10th Euromicro Workshop on Parallel, Distributed and Network-based Processing, Proceedings",
	doi = "10.1109/EMPDP.2002.994237",
	editor = "Vajda, F. and Podhorszki, N.",
	isbn = "0-7695-1444-8",
	note = "10th Euromicro Workshop on Parallel, Distributed and Network-based Processing (PDP 2002), Las Palmas de Gran Canaria, Spain, January 9--11, 2002",
	pages = "81--90",
	title = "Performance sensitivity of routing algorithms to failures in networks of workstations with regular and irregular topologies",
	year = 2002
}

J C Sancho, Antonio Robles and Jose Duato. Performance sensitivity of routing algorithms to failures in networks of workstations with regular and irregular topologies. 2002, 81 - 90. URL BibTeX

@conference{ 7205079,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) provide a cost-effective alternative to parallel computers. Components in NOWs may fail, degrading the network operation until the faults are repaired. In this paper, we analyze the influence of both switch and link failures on the network performance. In particular, given that network performance in NOWs strongly depends on the applied routing algorithm, we quantify the sensitivity to failures of two routing algorithms: flexible routing and up*/down* routing algorithms. In the case of up*/down* routing, two methodologies to compute routing tables are evaluated. Evaluation results modeling a Myrinet network show that, in general, up*/down* routing is more robust to failures, although its behavior strongly depends on the type of network topology, regular or irregular, and the methodology used to compute routing tables. However, the flexible routing algorithm presents a better performance, regardless of the network topology, even in presence of failures, but at expense of a larger sensitivity",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 10th Euromicro Workshop on Parallel, Distributed and Network-based Processing",
	doi = "10.1109/EMPDP.2002.994237",
	keywords = "computer networks;performance evaluation;workstation clusters;performance sensitivity;routing algorithms;networks of workstations;irregular topologies;regular topologies;link failures;switch failures;network performance;Myrinet network;",
	pages = "81--90",
	title = "Performance sensitivity of routing algorithms to failures in networks of workstations with regular and irregular topologies",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994237",
	year = 2002
}

M E Acacio, J Gonzalez, J M Garcia and Jose Duato. Reducing the latency of L2 misses in shared-memory multiprocessors through on-chip directory integration. 2002, 368 - 75. URL BibTeX

@conference{ 7205111,
	author = "Acacio, M. E. and Gonzalez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "Recent technology improvements allow multiprocessor designers to put some key components inside the processor chip, such as the memory controller and the network interface. In this paper, we exploit such an integration scale, presenting a new three-level directory architecture aimed at reducing the long L2 miss latencies and the memory overhead that characterize cc-NUMA machines and limit their scalability. The proposed architecture is based on the integration into the processor chip of the directory controller and a small first-level directory cache that stores precise information for the most recently referenced memory lines, as the means to reduce miss latencies. The second- and third-level directories are located near the main memory and they are only accessed when a directory entry for a certain memory line is not present in the first-level directory. This off-chip structure achieves the performance of a large and non-scalable full-map directory with a very significant reduction in the memory overhead. Using execution-driven simulations, we show that substantial latency reductions can be obtained by using the proposed directory architecture. Load, store and read-modify-write misses are significantly accelerated (latency reductions of more than 35\% in some cases). These reductions translate into important improvements on the final application performance (reductions up to 20\% in execution time)",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 10th Euromicro Workshop on Parallel, Distributed and Network-based Processing",
	doi = "10.1109/EMPDP.2002.994312",
	keywords = "cache storage;delays;microprocessor chips;parallel architectures;performance evaluation;shared memory systems;L2 miss latency reduction;shared-memory multiprocessors;on-chip directory integration;technology improvements;memory controller;network interface;integration scale;3-level directory architecture;memory overhead reduction;cc-NUMA machines;cache-coherent nonuniform memory access;scalability;directory controller;directory cache;recently referenced memory lines;main memory;off-chip structure;performance;execution-driven simulations;load misses;store misses;read-modify-write misses;application performance;execution time;",
	pages = "368--375",
	title = "Reducing the latency of {L2} misses in shared-memory multiprocessors through on-chip directory integration",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994312",
	year = 2002
}

J Arlandis, J C Perez-Cortes and José Cano Reyes. Rejection strategies and confidence measures for a k-NN classifier in an OCR task. In Pattern Recognition, 2002. Proceedings. 16th International Conference on 1. 2002, 576 - 579 vol.1. URL, DOI BibTeX

@conference{ 1044806,
	author = "Arlandis, J. and Perez-Cortes, J. C. and Cano Reyes, Jos{\'e}",
	abstract = "In handwritten character recognition, the rejection of extraneous patterns, like image noise, strokes or corrections, can improve significantly the practical usefulness of a system. In this paper a combination of two confidence measures defined for a k-nearest neighbors (NN) classifier is proposed. Experiments are presented comparing the performance of the same system with and without the new rejection rules.",
	booktitle = "Proceedings of the 16th International Conference on Pattern Recognition",
	doi = "10.1109/ICPR.2002.1044806",
	issn = "1051-4651",
	keywords = "OCR; confidence measures; extraneous pattern rejection; handwritten character recognition; k-nearest neighbors classifier; optical character recognition; probability;",
	pages = "576--579",
	title = "Rejection strategies and confidence measures for a {k-NN} classifier in an {OCR} task",
	url = "http://dx.doi.org/10.1109/ICPR.2002.1044806",
	volume = 1,
	year = 2002
}

Jose Flich, M P Malumbres, Pedro Lopez and Jose Duato. Removing the latency overhead of the ITB mechanism in COWs with source routing. 2002, 463 - 70. URL BibTeX

@conference{ 7205122,
	author = "Flich, Jose and Malumbres, M. P. and Lopez, Pedro and Duato, Jose",
	abstract = "Clusters of workstations (COWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. The in-transit buffer (ITB) mechanism can improve network performance when applied to COWs with irregular topology and source routing. This mechanism considerably improves the performance of this kind of network when compared to current source routing algorithms; however, it introduces a latency penalty. An implementation of this mechanism was performed, showing that the latency overhead of the mechanism may be noticeable, especially for short messages and at low network loads. In this paper, we analyze in detail the latency overhead of ITBs, proposing several mechanisms to reduce, hide and remove it. Firstly, we show, by simulation, the effect of an ITB implementation that is much slower than the one implemented. Then we propose three mechanisms that try to overcome the latency penalty. All the mechanisms are simple and can be easily implemented; also, they are out of the critical path of the ITB packet-processing procedure. The results show very good behaviour of the proposed mechanisms, considerably reducing or even completely removing the latency overhead",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 10th Euromicro Workshop on Parallel, Distributed and Network-based Processing",
	doi = "10.1109/EMPDP.2002.994334",
	keywords = "buffer storage;delays;performance evaluation;telecommunication network routing;workstation clusters;latency overhead removal;in-transit buffer mechanism;workstation clusters;source routing;network performance;irregular network topology;short messages;network loads;simulation;latency penalty;critical path;packet processing procedure;",
	pages = "463--470",
	title = "Removing the latency overhead of the {ITB} mechanism in {COWs} with source routing",
	url = "http://dx.doi.org/10.1109/EMPDP.2002.994334",
	year = 2002
}

Xavier Molero and Vicente Santonja. Simulating layer-4 load balancing strategies in Web clusters. 2002, 229 - 36. BibTeX

@conference{ 7447673,
	author = "Molero, Xavier and Santonja, Vicente",
	abstract = "Nowadays there exists an explosive demand of Web services. Clusters of Web servers (Web clusters), connected by a fast LAN, are emerging as an alternative for building highly scalable and high available Web services. They are scalable, reliable and cost-effective. However, there are a lot of challenges that must be addressed to tune and increase the performance of this kind of systems. More precisely, the way workload is distributed among servers is crucial factor to global performance. Performance evaluation may be based on analytical modeling or simulation modeling. The latter is more free and flexible. In order to predict the performance of Web clusters, we have implemented a very flexible and easy to use simulator that takes into account several configuration parameters. This tool can be used for both academical and research purposes. In this paper, the authors present a brief description of the employed simulation language, the queueing model in which the simulator is based on, the main input parameters and output variables, and some aspects about the internal design of the implemented tool. Finally, some experimental results obtained by using this tool are shown",
	address = "San Diego, CA, USA",
	booktitle = "Modelling and Simulation 2002. 16th European Simulation Multiconference 2002. ESM'2002",
	keywords = "file servers;Internet;local area networks;performance evaluation;queueing theory;virtual machines;layer-4 load balancing strategies;World Wide Web clusters;Internet Web services;Web servers;fast LAN;workload distribution;performance evaluation;analytical modeling;simulation modeling;",
	pages = "229--236",
	title = "Simulating layer-4 load balancing strategies in {Web} clusters",
	year = 2002
}

M E Acacio, J Gonzalez, J M Garcia and Jose Duato. The use of prediction for accelerating upgrade misses in cc-NUMA multiprocessors. 2002, 155 - 64. URL BibTeX

@conference{ 7503575,
	author = "Acacio, M. E. and Gonzalez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "This work is focused on accelerating upgrade misses in cc-NUMA multiprocessors. These misses are caused by store instructions for which a read-only copy of the line is found in the L2 cache. Upgrade misses require a message sent from the missing node to the directory, a directory lookup in order to find the set of sharers, invalidation messages being sent to the sharers and responses to the invalidations being sent back. Therefore, the penalty paid by these misses is not negligible, mainly if we consider that they account for a high percentage of the total miss rate. We propose the use of prediction as a means of providing cc-NUMA multiprocessors with a more efficient support for upgrade misses by directly invalidating sharers from the missing node. Our proposal comprises an effective prediction scheme achieving high hit rates as well as a coherence protocol extended to support the use of prediction. Our work is motivated by two key observations: first, upgrade misses present a repetitive behavior and, second, the total number of sharers being invalidated is small (one, in some cases). Using execution-driven simulations, we show that the use of prediction can significantly accelerate upgrade misses (latency reductions of more than 40\% in some cases). These important improvements translate into speed-ups on application performance up to 14\%. Finally, these results can be obtained including a predictor with a total size of less than 48 KB in every node",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings 2002 International Conference on Parallel Architectures and Compilation Techniques. PACT 2002",
	doi = "10.1109/PACT.2002.1106014",
	keywords = "cache storage;delays;memory protocols;shared memory systems;prediction;upgrade miss acceleration;cc-NUMA multiprocessors;L2 cache;direct invalidation;coherence protocol;repetitive behavior;sharers;execution-driven simulations;latency reductions;",
	pages = "155--164",
	title = "The use of prediction for accelerating upgrade misses in {cc-NUMA} multiprocessors",
	url = "http://dx.doi.org/10.1109/PACT.2002.1106014",
	year = 2002
}

Jose Duato, Antonio Robles, Federico Silla and R Beivide. A Comparison of Router Architectures for Virtual Cut-Through and Wormhole Switching in a NOW Environment. Journal of Parallel and Distributed Computing 61(2):224 - 253, 2001. URL BibTeX

@article{ 2004488488316,
	author = "Duato, Jose and Robles, Antonio and Silla, Federico and Beivide, R.",
	abstract = "Most multicomputer interconnection networks use wormhole switching, leading to fast and compact routers. Current routers incorporate virtual channels and even fully adaptive routing. Networks of workstations (NOWs) inherited multicomputer technology. Most commercial routers designed for NOWs implement wormhole switching. However, wormhole switching is not well suited for NOWs. The long wires required in this environment lead to large buffers to prevent buffer overflow during flow control signaling. Moreover, wire length is limited by buffer size. Virtual cut-through (VCT) achieves a higher throughput than wormhole switching. However, buffer requirements and packetizing overhead prevented its widespread use in multicomputers. Nevertheless, wormhole and VCT switching require similar buffer capacity in NOWs. Moreover, some messaging layers such as Illinois Fast Messages (FM) and BIP split messages into packets for increased performance. Therefore, the traditional disadvantages of VCT switching disappear in NOWs. In this paper, we show that VCT routers can be simpler than wormhole routers, while still achieving the advantages of using virtual channels and adaptive routing. We also propose a fully adaptive routing algorithm for VCT switching in a NOW environment. Moreover, we show that VCT routers outperform wormhole routers in a NOW environment at a lower cost. Also, VCT routers require buffer capacity independent of wire length, making them suitable for networks of workstations. {\copyright} 2001 Academic Press.",
	address = "Orlando, United States",
	doi = "10.1006/jpdc.2000.1679",
	issn = "0743-7315",
	journal = "Journal of Parallel and Distributed Computing",
	number = 2,
	pages = "224--253",
	title = "A Comparison of Router Architectures for Virtual Cut-Through and Wormhole Switching in a {NOW} Environment",
	url = "http://dx.doi.org/10.1006/jpdc.2000.1679",
	volume = 61,
	year = 2001
}

Elvira Baydal, Pedro Lopez and Jose Duato. A congestion control mechanism for wormhole networks. In Parallel and Distributed Processing, 2001. Proceedings. Ninth Euromicro Workshop on. 2001, 19 -26. URL, DOI BibTeX

@conference{ 904965,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Deadlock avoidance and recovery techniques suffer from severe performance degradation when the network is close to or beyond saturation. Many parallel applications produce bursty traffic that may saturate the network during some intervals, and increase execution time. Therefore, the use of techniques that prevent network saturation are of crucial importance in both deadlock avoidance and recovery strategies. Several mechanisms have been proposed in the literature to reach this goal. However some of them do not work well under all network load conditions. Others introduce some penalty when the network is not fully saturated, or complicate network and/or node implementation. In this paper we propose a new mechanism to avoid network saturation that overcomes these drawbacks. In this mechanism, each node estimates network traffic locally by using the percentage of free virtual output channels that can be used for forwarding a message towards its destination. When this number surpasses a threshold value, network congestion is assumed to exist and message injection is forbidden",
	booktitle = "Proceedings of the Ninth Euromicro Workshop on Parallel and Distributed Processing",
	doi = "10.1109/EMPDP.2001.904965",
	keywords = "bursty traffic;congestion control mechanism;deadlock avoidance;deadlock recovery;free virtual output channels;message injection;network congestion;network load conditions;network saturation;network traffic;performance degradation;threshold value;wormhole networks",
	pages = "19--26",
	title = "A congestion control mechanism for wormhole networks",
	url = "http://dx.doi.org/10.1109/EMPDP.2001.904965",
	year = 2001
}

Elvira Baydal, Pedro Lopez and Jose Duato. A congestion control mechanism for wormhole networks. 2001, 19 - 26. URL BibTeX

@conference{ 6867163,
	author = "Baydal, Elvira and Lopez, Pedro and Duato, Jose",
	abstract = "Deadlock avoidance and recovery techniques suffer from severe performance degradation when the network is close to or beyond saturation. Many parallel applications produce bursty traffic that may saturate the network during some intervals, and increase execution time. Therefore, the use of techniques that prevent network saturation are of crucial importance in both deadlock avoidance and recovery strategies. Several mechanisms have been proposed in the literature to reach this goal. However some of them do not work well under all network load conditions. Others introduce some penalty when the network is not fully saturated, or complicate network and/or node implementation. In this paper we propose a new mechanism to avoid network saturation that overcomes these drawbacks. In this mechanism, each node estimates network traffic locally by using the percentage of free virtual output channels that can be used for forwarding a message towards its destination. When this number surpasses a threshold value, network congestion is assumed to exist and message injection is forbidden",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Ninth Euromicro Workshop on Parallel and Distributed Processing",
	doi = "10.1109/EMPDP.2001.904965",
	keywords = "multiprocessor interconnection networks;network routing;performance evaluation;system recovery;telecommunication congestion control;congestion control mechanism;wormhole networks;deadlock avoidance;deadlock recovery;performance degradation;bursty traffic;network saturation;network load conditions;network traffic;free virtual output channels;threshold value;network congestion;message injection;",
	pages = "19--26",
	title = "A congestion control mechanism for wormhole networks",
	url = "http://dx.doi.org/10.1109/EMPDP.2001.904965",
	year = 2001
}

Juan Miguel Martínez, Pedro Lopez and Jose Duato. A cost-effective approach to deadlock handling in wormhole networks. Parallel and Distributed Systems, IEEE Transactions on 12(7):716 -729, July 2001. URL, DOI BibTeX

@article{ 940746,
	author = "Mart{\'\i}nez, Juan Miguel and Lopez, Pedro and Duato, Jose",
	abstract = "Wormhole networks have traditionally used deadlock avoidance strategies. More recently, deadlock recovery strategies have begun to gain acceptance. In particular, progressive deadlock recovery techniques allocate a few dedicated resources to quickly deliver deadlocked packets. Deadlock recovery is based on the assumption that deadlocks are rare; otherwise, recovery techniques are not efficient. Measurements of deadlock occurrence frequency show that deadlocks are highly unlikely when enough routing freedom is provided. However, networks are more prone to deadlocks when the network is close to or beyond saturation, causing some network performance degradation. Similar performance degradation behavior at saturation was also observed in networks using deadlock avoidance strategies. In this paper, we take a different approach to handling deadlocks and performance degradation. We propose the use of an injection limitation mechanism that prevents performance degradation near the saturation point and, at the same time, reduces the probability of deadlock to negligible values. We also propose an improved deadlock detection mechanism that uses only local information, detects all deadlocks, and considerably reduces the probability of false deadlock detection over previous proposals. In the rare case when impending deadlock is detected, our proposal consists of using a simple recovery technique that absorbs the deadlocked message at the current node and later reinjects it for continued routing toward its destination. Performance evaluation results show that our new approach to handling deadlock is more efficient than previously proposed techniques",
	doi = "10.1109/71.940746",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "cost-effective approach;deadlock avoidance;deadlock handling;deadlock occurrence frequency;deadlock recovery;injection limitation mechanism;network performance degradation;performance degradation;performance evaluation;wormhole networks;concurrency control",
	month = jul,
	number = 7,
	pages = "716--729",
	title = "A cost-effective approach to deadlock handling in wormhole networks",
	url = "http://dx.doi.org/10.1109/71.940746",
	volume = 12,
	year = 2001
}

Juan Miguel Martínez, Pedro Lopez and Jose Duato. A cost-effective approach to deadlock handling in wormhole networks. IEEE Transactions on Parallel and Distributed Systems 12(7):716 - 729, 2001. URL, DOI BibTeX

@article{ 2001376648866,
	author = "Mart{\'\i}nez, Juan Miguel and Lopez, Pedro and Duato, Jose",
	abstract = "Wormhole networks have traditionally used deadlock avoidance strategies. More recently, deadlock recovery strategies have begun to gain acceptance. In particular, progressive deadlock recovery techniques allocate a few dedicated resources to quickly deliver deadlocked packets. Deadlock recovery is based on the assumption that deadlocks are rare; otherwise, recovery techniques are not efficient. Measurements of deadlock occurrence frequency show that deadlocks are highly unlikely when enough routing freedom is provided. However, networks are more prone to deadlocks when the network is close to or beyond saturation, causing some network performance degradation. Similar performance degradation behavior at saturation was also observed in networks using deadlock avoidance strategies. In this paper, we take a different approach to handling deadlocks and performance degradation. We propose the use of an injection limitation mechanism that prevents performance degradation near the saturation point and, at the same time, reduces the probability of deadlock to negligible values. We also propose an improved deadlock detection mechanism that uses only local information, detects all deadlocks, and considerably reduces the probability of false deadlock detection over previous proposals. In the rare case when impending deadlock is detected, our proposal consists of using a simple recovery technique that absorbs the deadlocked message at the current node and later reinjects it for continued routing toward its destination. Performance evaluation results show that our new approach to handling deadlock is more efficient than previously proposed techniques.",
	doi = "10.1109/71.940746",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	keywords = "Interconnection networks;Communication channels;Computer system recovery;Multiprocessing programs;Packet networks;Wormhole networks;",
	number = 7,
	pages = "716--729",
	title = "A cost-effective approach to deadlock handling in wormhole networks",
	url = "http://dx.doi.org/10.1109/71.940746",
	volume = 12,
	year = 2001
}

M B Caminero, C Carrion, F J Quiles, Jose Duato and S Yalamanchili. A cost-effective hardware link scheduling algorithm for the multimedia router (MMR). 2001, 358 - 69. BibTeX

@conference{ 7175396,
	author = "Caminero, M. B. and Carrion, C. and Quiles, F. J. and Duato, Jose and Yalamanchili, S.",
	abstract = "The primary objective of the Multimedia Router (MMR) project is the design and implementation of a compact router optimized for multimedia applications. The router is targeted for use in cluster and LAN interconnection networks, which offer different constraints and therefore differing router solutions than WANs. One of the key elements in order to achieve these goals is the scheduling algorithm. The authors have proposed a link/switch scheduling algorithm that is capable of providing different QoS guarantees to flows as needed. This work focuses on the reduction of the hardware complexity necessary to implement such an algorithm. A novel priority algorithm is presented, and its hardware complexity is compared to that of the original proposal",
	address = "Berlin, Germany",
	booktitle = "Networking - ICN 2001. First International Conference on Networking. Proceedings, Part II",
	keywords = "communication complexity;firmware;LAN interconnection;multimedia communication;performance evaluation;quality of service;scheduling;telecommunication computing;telecommunication network routing;telecommunication switching;",
	note = "multimedia router;cost-effective hardware link scheduling algorithm;optimized compact router;cluster networks;LAN interconnection networks;constraints;link/switch scheduling algorithm;service quality guarantees;hardware complexity reduction;priority algorithm;multimedia communications switching;performance evaluation;",
	pages = "358--369",
	series = "Lecture Notes in Computer Science",
	title = "{A} cost-effective hardware link scheduling algorithm for the multimedia router ({MMR})",
	volume = 2094,
	year = 2001
}

Salvador Coll, Jose Flich, M P Malumbres, Pedro Lopez, Jose Duato and F J Mora. A first implementation of in-transit buffers on Myrinet GM software. In Parallel and Distributed Processing Symposium., Proceedings 15th International. April 2001, 1640 - 1647. URL, DOI BibTeX

@conference{ 925150,
	author = "Coll, Salvador and Flich, Jose and Malumbres, M. P. and Lopez, Pedro and Duato, Jose and Mora, F. J.",
	abstract = "Clusters of workstations (COWs) are becoming increasingly popular as a cost-effective alternative to parallel computers. In these systems, the interconnection network connects hosts using irregular topologies, providing the wiring flexibility, scalability, and incremental expansion capability required in this environment. Myrinet is the most popular network used to build COWs. It uses source routing with the up*/down* routing algorithm. In previous papers we proposed the In-Transit Buffer (ITB) mechanism that improves network performance by allowing minimal routing, balancing network traffic, and reducing network contention. The mechanism is based on ejecting packets at some intermediate hosts and later re-injecting them into the network. Moreover, the ITB mechanism does not require additional hardware as it can be implemented on the software running at Myrinet network adapters. In this paper, we present a first implementation of the ITB mechanism on Myrinet GM software. We show the changes required in packet format and the modifications performed in the Myrinet Control Program (MCP). In addition, both the overhead introduced by the new code and the cost of extracting and re-injecting packets are measured. Results show that, even for this simple implementation, code overhead is only about 125 ns per packet and the message latency increase for messages that use the ITB mechanism is around 1.3 $\mu$s per ITB. This is the first attempt to implement this mechanism, showing that a real implementation of ITBs is feasible on Myrinet COWs, and the associated overhead does not restrict the potential benefits of this mechanism.",
	booktitle = "Parallel and Distributed Processing Symposium., Proceedings 15th International",
	doi = "10.1109/IPDPS.2001.925150",
	isbn = "0-7695-0990-8",
	issn = "1530-2075",
	month = apr,
	pages = "1640--1647",
	title = "{A} first implementation of in-transit buffers on {Myrinet} {GM} software",
	url = "http://dx.doi.org/10.1109/IPDPS.2001.925150",
	year = 2001
}

Jose Duato and Timothy Mark Pinkston. A general theory for deadlock-free adaptive routing using a mixed set of resources. IEEE Transactions on Parallel and Distributed Systems 12(12):1219 - 1235, 2001. URL BibTeX

@article{ 2002056842591,
	author = "Duato, Jose and Pinkston, Timothy Mark",
	abstract = "This paper presents a theoretical framework for the design of deadlock-free fully adaptive routing algorithms for a general class of network topologies and switching techniques in a single, unified theory. A general theory is proposed that allows the design of deadlock avoidance-based as well as deadlock recovery-based wormhole and virtual cut-through adaptive routing algorithms that use a homogeneous or a heterogeneous (mixed) set of resources. The theory also allows channel queues to be allocated nonatomically, utilizing resources efficiently. A general methodology for the design of fully adaptive routing algorithms applicable to arbitrary network topologies is also proposed. The proposed theory and methodology allow the design of efficient network routers that require minimal resources for handling infrequent deadlocks.",
	doi = "10.1109/71.970556",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Parallel processing systems",
	keywords = "Adaptive algorithms;Interconnection networks;Queueing networks;Resource allocation;Routers;",
	note = "Adaptive routing algorithms;Irregular networks;Network routers;Network topologies;Nonatomic queue allocation;Regular networks;",
	number = 12,
	pages = "1219--1235",
	title = "{A} general theory for deadlock-free adaptive routing using a mixed set of resources",
	url = "http://dx.doi.org/10.1109/71.970556",
	volume = 12,
	year = 2001
}

M E Acacio, J Gonzalez, J M Garcia and Jose Duato. A new scalable directory architecture for large-scale multiprocessors. 2001, 97 - 106. URL BibTeX

@conference{ 6846670,
	author = "Acacio, M. E. and Gonzalez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "The memory overhead introduced by directories constitutes a major hurdle in the scalability of cc-NUMA architectures, which makes the shared-memory paradigm unfeasible for very large-scale systems. This work is focused on improving the scalability of shared-memory multiprocessors by significantly reducing the size of the directory. We propose multilayer clustering as an effective approach to reduce the directory-entry width. Detailed evaluation for 64 processors shows that using this approach we can drastically reduce the memory overhead, while suffering a performance degradation very similar to previous compressed schemes (such as Coarse Vector). In addition, a novel two-level directory architecture is proposed in order to eliminate the penalty caused by these compressed directories. This organization consists of a small Full-Map first-level directory (which provides precise information for the most recently referenced lines) and a compressed second-level directory (which provides in-excess information). Results show that a system with this directory architecture can achieve the same performance as a multiprocessor with a big and non-scalable Full-Map directory with a very significant reduction of the memory overhead",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings HPCA Seventh International Symposium on High-Performance Computer Architecture",
	doi = "10.1109/HPCA.2001.903255",
	keywords = "parallel architectures;performance evaluation;shared memory systems;",
	note = "large-scale multiprocessors;scalable directory architecture;memory overhead;scalability;shared-memory multiprocessors;multilayer clustering;",
	pages = "97--106",
	title = "{A} new scalable directory architecture for large-scale multiprocessors",
	url = "http://dx.doi.org/10.1109/HPCA.2001.903255",
	year = 2001
}

J M Orduna, Federico Silla and Jose Duato. A new task mapping technique for communication-aware scheduling strategies. 2001, 349 - 54. URL BibTeX

@conference{ 7075370,
	author = "Orduna, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Clusters have become a very cost-effective platform for high-performance computing. In these systems, the trend is towards the interconnection network becoming the system bottleneck. Therefore, in the future, scheduling strategies will have to take into account the communication requirements of the applications and the communication bandwidth that the network can offer. One of the key issues in these strategies is the task mapping technique used when the network becomes the system bottleneck. In this paper, we propose an enhanced version of a previously proposed mapping technique that takes into account not only the existing network resources, but also the traffic generated by the applications. Also, we evaluate the mapping technique using real MPI application traces with timestamps. Evaluation results show that the use of the new mapping technique fully exploits the available network bandwidth, improving load balancing and increasing the throughput that can be delivered by the network",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings International Conference on Parallel Processing Workshops",
	doi = "10.1109/ICPPW.2001.951971",
	keywords = "multiprocessor interconnection networks;performance evaluation;processor scheduling;workstation clusters;",
	note = "interconnection network;scheduling;clusters;communication-aware scheduling;mapping technique;MPI application traces;task mapping;",
	pages = "349--354",
	title = "{A} new task mapping technique for communication-aware scheduling strategies",
	url = "http://dx.doi.org/10.1109/ICPPW.2001.951971",
	year = 2001
}

R Casado, A Bermudez, Jose Duato, J Quiles and J L Sanchez. A protocol for deadlock-free dynamic reconfiguration in high-speed local area networks. IEEE Transactions on Parallel and Distributed Systems 12(2):115 - 132, 2001. URL BibTeX

@article{ 2001206504242,
	author = "Casado, R. and Bermudez, A. and Duato, Jose and Quiles, J. and Sanchez, J. L.",
	abstract = "High-speed local area networks (LANs) consist of a set of switches interconnected by point-to-point links, and hosts linked to those switches through a network interface card. High-speed LANs may change their topology due to switches being turned on/off, hot expansion, link remapping, and component failures. In these cases, a distributed reconfiguration protocol analyzes the topology, computes the new routing tables, and downloads them to the corresponding switches. Unfortunately, in most cases, user traffic is stopped during the reconfiguration process to avoid deadlock. These strategies are called static reconfiguration techniques. Although network reconfigurations are not frequent, static reconfiguration such as this may take hundreds of milliseconds to execute, thus degrading system availability significantly. Several distributed real-time applications have strict communication requirements. Distributed multimedia applications have similar, although less strict, quality of service (QoS) requirements [3], [4]. Both stopping packet transmission and discarding packets due to the reconfiguration process prevent the system from satisfying the above requirements. Therefore, in order to support hard real-time and distributed multimedia applications over a high-speed LAN, we need to avoid stopping user traffic and discarding packets when the topology changes. In this paper, we propose a new deadlock-free distributed reconfiguration protocol that is able to asynchronously update routing tables without stopping user traffic. This protocol is valid for any topology, including regular as well as irregular topologies. It is also valid for packet switching as well as for cut-through switching techniques and does not rely on the existence of virtual channels to work. Simulation results show that the behaviour of our protocol is significantly better than for other protocols based on stopping user traffic.",
	doi = "10.1109/71.910868",
	issn = "1045-9219",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	key = "Distributed computer systems",
	keywords = "Computer simulation;Computer system recovery;Interconnection networks;Interfaces;Local area networks;Multimedia systems;Network protocols;Packet switching;Quality of service;Real time systems;Telecommunication traffic;",
	note = "Deadlock avoidance;Dynamic reconfiguration;High speed networks;Irregular topologies;Static reconfiguration;System availability;",
	number = 2,
	pages = "115--132",
	title = "{A} protocol for deadlock-free dynamic reconfiguration in high-speed local area networks",
	url = "http://dx.doi.org/10.1109/71.910868",
	volume = 12,
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. A tool for the design and evaluation of fibre channel storage area networks. 2001, 133 - 140. URL BibTeX

@conference{ 2001296584391,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "The fast growth of data intensive applications has caused a change in the traditional storage model. The server-to-disk approach, usually implemented with SCSI buses, is being replaced by storage area networks (SANs), which enable storage to be externalized from servers, thus allowing storage devices to be shared among multiple servers. A SAN is a separate network for storage, isolated from the messaging network and optimized for the movement of data between servers and storage devices. Nowadays, most of current SANs use Fibre Channel as the technology to move data between servers and storage devices. In order to design and evaluate the performance of these systems it is necessary to have adequate tools. Usually, performance evaluation may be based on analytical modeling or simulation. Each of them differs in their scope and applicability. However, the simulation modeling technique offers more freedom, flexibility, and accuracy than analytical methods. Thus, when evaluating the performance of SANs, simulation modeling should be used. In this paper we present the main capabilities of a simulator for Fibre Channel SANs, focusing on its input parameters and output variables. We also show several simple examples of performance measurements that can be obtained using this tool.",
	address = "Seattle, WA, United States",
	booktitle = "Proceedings of the IEEE Annual Simulation Symposium",
	doi = "10.1109/SIMSYM.2001.922125",
	issn = "0272-4715",
	key = "Data storage equipment",
	keywords = "Client server computer systems;Communication channels;Computer simulation;Local area networks;Mathematical models;Optimization;",
	note = "Fiber channel storage area networks;Multiple servers;",
	pages = "133--140",
	title = "{A} tool for the design and evaluation of fibre channel storage area networks",
	url = "http://dx.doi.org/10.1109/SIMSYM.2001.922125",
	year = 2001
}

Salvador Petit, Julio Sahuquillo and A Pont. About the sensitivity of the HLRC-DU protocol on diff and page sizes. In Performance Analysis of Systems and Software, 2001. ISPASS. 2001 IEEE International Symposium on. 2001, 45 -48. URL, DOI BibTeX

@conference{ 990675,
	author = "Petit, Salvador and Sahuquillo, Julio and Pont, A.",
	booktitle = "Performance Analysis of Systems and Software, 2001. ISPASS. 2001 IEEE International Symposium on",
	doi = "10.1109/ISPASS.2001.990675",
	pages = "45--48",
	publisher = "IEEE Computer Society Press",
	title = "{A}bout the sensitivity of the {HLRC}-{DU} protocol on diff and page sizes",
	url = "http://dx.doi.org/10.1109/ISPASS.2001.990675",
	year = 2001
}

Vicente Chirivella, Rosa Alcover and Jose Duato. Accurate reliability and availability models for direct interconnection networks. In Parallel Processing, International Conference on, 2001. September 2001, 517 - 524. URL, DOI BibTeX

@conference{ 7075325,
	author = "Chirivella, Vicente and Alcover, Rosa and Duato, Jose",
	abstract = "Fault tolerance in multicomputer interconnection networks has been traditionally studied by determining the worst possible combination of faulty components that causes its failure and then assuming that this will occur. But, the probability of the worst possible combination is usually low, and the routing algorithm may be able to find a route between source and destination nodes. The network dependability parameters computed according to this approach will be underestimated. In this paper we propose a methodology for accurately evaluating interconnection network dependability. In addition, we apply it to obtain an accurate estimation of the reliability and availability parameters in a 2-D mesh, taking into account network size, routing algorithm, failure and repair rates of nodes, and coverage. Finally we compare the computed results under both approaches",
	booktitle = "Parallel Processing, International Conference on, 2001",
	doi = "10.1109/ICPP.2001.952099",
	isbn = "0-7695-1257-7",
	journal = "Proceedings International Conference on Parallel Processing",
	keywords = "fault tolerant computing;multiprocessor interconnection networks;network routing;",
	month = sep,
	note = "accurate reliability;availability models;direct interconnection networks;fault tolerance;multicomputer interconnection networks;faulty components;routing algorithm;network dependability parameters;network size;",
	pages = "517--524",
	title = "{A}ccurate reliability and availability models for direct interconnection networks",
	url = "http://dx.doi.org/10.1109/ICPP.2001.952099",
	year = 2001
}

Pedro Lopez, Jose Flich and Jose Duato. Deadlock-free routing in InfiniBand™ through destination renaming. In Parallel Processing, International Conference on, 2001. 2001, 427 - 434. DOI BibTeX

@conference{ 952089,
	author = "Lopez, Pedro and Flich, Jose and Duato, Jose",
	abstract = "The InfiniBand Architecture (IBA) defines a switch-based network with point-to-point links that supports any topology defined by the user including irregular ones, in order to provide flexibility and incremental expansion capability. Routing in IBA is distributed, based on forwarding tables, and only considers the packet destination ID for routing within subnets in order to drastically reduce forwarding table size. Unfortunately, the forwarding tables for most of the previously proposed routing algorithms for irregular topologies consider both the destination ID and the input channel. Therefore, these popular routing algorithms for irregular topologies may not be usable in InfiniBand networks because they do not conform to the IBA specifications. In this paper we propose an easy-to-implement strategy to adapt the forwarding tables already computed following any routing algorithm that considers the destination ID and the input channel into the required IBA forwarding table format. The resulting routing algorithm is deadlock-free on IBA. Indeed, the originally computed paths are not modified at all. Hence, the proposed strategy does not degrade performance with respect to the original routing scheme.",
	booktitle = "Parallel Processing, International Conference on, 2001",
	doi = "10.1109/ICPP.2001.952089",
	keywords = "InfiniBand Architecture; deadlock-free; destination renaming; packet destination; routing algorithms; switch-based network; multiprocessor interconnection networks; network routing;",
	month = sep,
	pages = "427--434",
	title = "{D}eadlock-free routing in {InfiniBand\texttrademark} through destination renaming",
	year = 2001
}

J C Sancho, Antonio Robles and Jose Duato. Effective strategy to compute forwarding tables for InfiniBand networks. 2001, 48 - 57. BibTeX

@conference{ 7081877,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "InfiniBand is very likely to become the de facto standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) defines a switch-based network with point-to-point links that support any topology defined by the user. Routing in IBA is distributed based on forwarding tables, and only considers the packet destination ID for routing within subnets. Up*/down* routing is the simplest and most popular routing algorithm for irregular topologies. Unfortunately, up*/down* routing cannot be used in IBA switches because it may lead to deadlock. In this paper we address this issue, proposing an easy-to-implement strategy to complete up*/down* forwarding tables for IBA switches that guarantees deadlock freedom, and is effective whatever the methodology applied to compute up*/down* routing tables. Preliminary evaluation results modeling an InfiniBand network at register transfer level show that the proposed strategy allows up*/down* routing algorithms to be implemented on InfiniBand networks with minimal performance degradation",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings International Conference on Parallel Processing",
	keywords = "concurrency control;multiprocessor interconnection networks;network routing;performance evaluation;",
	note = "forwarding tables;infiniBand networks;processing nodes;I/O devices;interprocessor communication;switch-based network;point-to-point links;packet destination;easy-to-implement strategy;register transfer level;minimal performance degradation;",
	pages = "48--57",
	title = "{E}ffective strategy to compute forwarding tables for {InfiniBand} networks",
	year = 2001
}

JC Sancho, Antonio Robles and Jose Duato. Effective strategy to compute forwarding tables for InfiniBand networks. In LM Ni and M Valero (eds.). PROCEEDINGS OF THE 2001 INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING. 2001, 48-57. BibTeX

@conference{ isi:000171882100006,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "InfiniBand is very likely to become the de facto standard for communication between processing nodes and I/O devices as well as for interprocessor communication. The InfiniBand Architecture (IBA) defines a switch-based network with point-to-point links that support any topology defined by the user. Routing in IBA is distributed, based on forwarding tables, and only considers the packet destination ID for routing within subnets. Up{*}/down{*} routing is the simplest and most popular routing algorithm for irregular topologies. Unfortunately, up{*}/down{*} routing cannot be used in IBA switches because it may lead to deadlock. In this paper we address this issue, proposing an easy-to-implement strategy to compute up{*}/down{*} forwarding tables for IBA switches that guarantees deadlock freedom, and is effective whatever the methodology applied to compute up{*}/down{*} routing tables. Preliminary evaluation results modeling an InfiniBand network at register transfer level show that the proposed strategy allows up{*}/down{*} routing algorithms to be implemented on InfiniBand networks with minimal performance degradation.",
	booktitle = "PROCEEDINGS OF THE 2001 INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING",
	editor = "Ni, L. M. and Valero, M.",
	isbn = "0-7695-1258-5",
	issn = "0190-3918",
	note = "30th International Conference on Parallel Processing (ICPP 01), VALENCIA, SPAIN, SEP 03-07, 2001",
	pages = "48--57",
	series = "PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING",
	title = "{E}ffective strategy to compute forwarding tables for {InfiniBand} networks",
	year = 2001
}

E Moyano, F J Quiles, A Garrido, T Orozco-Barbosa and Jose Duato. Efficient 3D wavelet transform decomposition for video compression. 2001, 118 - 25. URL BibTeX

@conference{ 7005751,
	author = "Moyano, E. and Quiles, F. J. and Garrido, A. and Orozco-Barbosa, T. and Duato, Jose",
	abstract = "We present an efficient three-dimensional wavelet transform (3D-WT) algorithm for video compression. This algorithm performs the temporal decomposition of a video sequence in a more efficient way than the classical 3D-WT algorithm. We have conducted a set of experimental evaluations of the proposed algorithm using various video sequences. Experimental results show that our algorithm exhibits lower memory demands and lower latencies for the compression and decompression processes than the classical algorithm at the same compression ratio",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Second International Workshop on Digital and Computational Video",
	doi = "10.1109/DCV.2001.929950",
	keywords = "data compression;image sequences;transform coding;video coding;wavelet transforms;",
	note = "3D wavelet transform;video compression;three-dimensional wavelet transform;3D-WT;temporal decomposition;video sequences;memory demands;latencies;decompression;",
	pages = "118--125",
	title = "{E}fficient {3D} wavelet transform decomposition for video compression",
	url = "http://dx.doi.org/10.1109/DCV.2001.929950",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. Improving network performance by efficiently dealing with short control messages in fibre channel SANs. 2001, 901 - 10. BibTeX

@conference{ 7219763,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Traffic in storage area networks (SANs) is bimodal, composed of long messages carrying several KBytes of data, and short messages containing control information (I/O commands). From the network point of view, latency of control messages is highly affected by the transmission of data messages, due to their length. As a consequence, it is necessary to establish management policies that benefit the transmission of short control messages, thus reducing the overall response time for I/O operations and increasing network throughput. We propose several strategies for dealing with short control messages and analyze their impact on the performance of storage area networks. This analysis is carried out for a fully adaptive routing algorithm in the context of two different network topology environments: buildings and departments. Simulation results show that both I/O response time and network throughput may be improved when efficiently managing control messages",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2001 Parallel Processing. 7th International Euro-Par Conference. Proceedings",
	keywords = "digital storage;local area networks;",
	note = "network performance;short control messages;fibre channel SANs;storage area networks;bimodal traffic;latency;data messages;management policies;response time;I/O operations;network topology environments;",
	pages = "901--910",
	series = "Lecture Notes in Computer Science",
	title = "{I}mproving network performance by efficiently dealing with short control messages in fibre channel {SAN}s",
	volume = 2150,
	year = 2001
}

Jose Flich, Pedro Lopez, M P Malumbres, Jose Duato and T Rokicki. Improving network performance by reducing network contention in source-based COWS with a low path-computation overhead. In Parallel and Distributed Processing Symposium., Proceedings 15th International. April 2001, 8 pp. DOI BibTeX

@conference{ 925016,
	author = "Flich, Jose and Lopez, Pedro and Malumbres, M. P. and Duato, Jose and Rokicki, T.",
	abstract = "In previous papers, we have proposed the in-transit buffer mechanism (ITB) to improve network performance in COWs with irregular topology and source routing. This mechanism allows the use of minimal paths among all hosts, breaking cyclic dependences between channels by storing and later re-injecting packets at some intermediate hosts. However it also has two additional features that can improve even more network performance. First, the ITB mechanism reduces network contention because some messages are ejected from the network freeing network links. Second the ITB mechanism allows the use of any path between each source-destination pair improving traffic balance. In this paper we present a new routing algorithm that takes advantage of ITB by exploiting both issues: traffic balance and network contention reduction. The evaluation results show that network throughput can be considerably improved. On average, network throughput increases with respect to up*/down* by factors of 2.51 and 3.77 in 32 and 64-switch networks, respectively",
	booktitle = "Parallel and Distributed Processing Symposium., Proceedings 15th International",
	doi = "10.1109/IPDPS.2001.925016",
	keywords = "in-transit buffer mechanism;network contention;network performance;network throughput;source routing;source-based COWS;traffic balance;performance evaluation;workstation clusters;",
	month = apr,
	pages = "8 pp.",
	title = "{I}mproving network performance by reducing network contention in source-based {COWS} with a low path-computation overhead",
	year = 2001
}

Rosa Alcover, Vicente Chirivella and Jose Duato. Improving the accuracy of reliability models for direct interconnection networks. In Rizos Sakellariou, John Gurd, Len Freeman and John Keane (eds.). Euro-Par 2001 Parallel Processing 2150. August 2001, 621 - 629. URL, DOI BibTeX

@conference{ 7211185,
	author = "Alcover, Rosa and Chirivella, Vicente and Duato, Jose",
	abstract = "Fault-tolerance in multicomputer interconnection networks has been traditionally studied by determining the worst possible combination of faulty components that causes a network failure and then assuming that this will occur. But the worst possible combination may occur with low probability and the routing algorithm may allow the network to work, even when there is a large number of faults. Thus, the network dependability parameters computed according to this approach will be underestimated. Previously (V. Chirivella and R. Alcover, 2000), we proposed a new methodology based on Markov chains, for evaluating interconnection network dependability. Using this methodology, we can accurately compute the network reliability behavior. We apply it to evaluate dependability parameters in a 2-D mesh, taking into account network size, routing algorithm, failure and repair rates of nodes and coverage. Finally, we compare the computed results to a traditional approach",
	address = "Berlin, Germany",
	booktitle = "Euro-Par 2001 Parallel Processing",
	doi = "10.1007/3-540-44681-8_89",
	editor = "Sakellariou, Rizos and Gurd, John and Freeman, Len and Keane, John",
	isbn = "978-3-540-42495-6",
	journal = "Euro-Par 2001 Parallel Processing. 7th International Euro-Par Conference. Proceedings (Lecture Notes in Computer Science Vol.2150)",
	keywords = "fault tolerant computing;Markov processes;multiprocessor interconnection networks;network routing;",
	month = aug,
	note = "reliability model accuracy;direct interconnection networks;fault-tolerance;multicomputer interconnection networks;worst possible combination;faulty components;network failure;probability;routing algorithm;network dependability parameters;Markov chains;interconnection network dependability;network reliability behavior;dependability parameters;2D mesh;network size;repair rates;failure rates;",
	pages = "621--629",
	publisher = "Springer",
	series = "Lecture Notes in Computer Science",
	title = "{I}mproving the accuracy of reliability models for direct interconnection networks",
	url = "http://dx.doi.org/10.1007/3-540-44681-8_89",
	volume = 2150,
	year = 2001
}

A Perles, Xavier Molero, A Marti, Vicente Santonja and J J Serrano. Improving the execution of groups of simulations on a cluster of workstations and its application to storage area networks. 2001, 227 - 234. URL BibTeX

@conference{ 2001296584402,
	author = "Perles, A. and Molero, Xavier and Marti, A. and Santonja, Vicente and Serrano, J. J.",
	abstract = "Parallel simulation methods can be used to reduce the execution time of simulations of complex systems. This approach is being used to improve the execution time of a storage area network (SAN) simulator designed in our department. From our experience in planning simulation experiments, we have realized that, in most cases, a simulation experiment (group of simulations) is executed while varying only one input variable, which usually corresponds to the input workload or a configuration model parameter. In this paper we propose two methods to reduce the overall execution time of a simulation experiment using a cluster of workstations. The first method uses the first simulation in order to tune the rest of the remaining work to be done in the experiment. The second method, based in the first one, tries to minimize the negative influence of the initial transient period by chaining the simulations in the experiment. We show that these two methods noticeably decrease the overall execution time needed to run the simulations that compose the experiment.",
	address = "Seattle, WA, United States",
	booktitle = "Proceedings of the IEEE Annual Simulation Symposium",
	doi = "10.1109/SIMSYM.2001.922136",
	issn = "0272-4715",
	key = "Computer simulation",
	keywords = "Computer networks;Computer workstations;Digital storage;Large scale systems;Parallel processing systems;Time series analysis;",
	note = "Execution time;Storage area networks;",
	pages = "227--234",
	title = "{I}mproving the execution of groups of simulations on a cluster of workstations and its application to storage area networks",
	url = "http://dx.doi.org/10.1109/SIMSYM.2001.922136",
	year = 2001
}

R Casado, A Bermudez, F J Quiles and Jose Duato. Influence of network size and load on the performance of reconfiguration protocols. 2001, 46 - 57. URL BibTeX

@conference{ 7114036,
	author = "Casado, R. and Bermudez, A. and Quiles, F. J. and Duato, Jose",
	abstract = "Switched point-to-point interconnection networks provide the high bandwidth and low latency required by current distributed applications. When the topology changes, a reconfiguration of the routing tables is performed to maintain network connectivity. In order to prevent deadlock, traditional reconfiguration schemes discard application traffic during the reconfiguration process. The consequence is that the network cannot provide the bandwidth demanded by user applications. In order to solve this problem, we proposed two deadlock-free schemes that allow traffic through the network while the reconfiguration is being performed. By using these schemes, the network is able to fulfill the applications requirements. In this paper, we evaluate these traditional and novel reconfiguration schemes. In particular, we analyze the impact of network size and load on their behavior. Application traffic has been modeled by means of a self-similar pattern. Simulation results clearly show the large performance degradation associated with the traditional approach and the significant benefits that can be obtained by using dynamic reconfiguration techniques",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings IEEE International Symposium on Network Computing and Applications. NCA 2001",
	doi = "10.1109/NCA.2001.962515",
	keywords = "local area networks;multiprocessor interconnection networks;performance evaluation;protocols;reconfigurable architectures;workstation clusters;",
	note = "interconnection networks;network connectivity;reconfiguration schemes;deadlock-free schemes;dynamic reconfiguration;point-to-point interconnection networks;system area networks;networks of workstations;reconfiguration protocol;performance evaluation;protocol;",
	pages = "46--57",
	title = "{I}nfluence of network size and load on the performance of reconfiguration protocols",
	url = "http://dx.doi.org/10.1109/NCA.2001.962515",
	year = 2001
}

Manuel E Acacio, Jose Gonzalez, Jose M Garcia and Jose Duato. New scalable directory architecture for large-scale multiprocessors. 2001, 97 - 106. BibTeX

@conference{ 2001385584315,
	author = "Acacio, Manuel E. and Gonzalez, Jose and Garcia, Jose M. and Duato, Jose",
	abstract = "The memory overhead introduced by directories constitutes a major hurdle in the scalability of cc-NUMA architectures, which makes the shared-memory paradigm unfeasible for very large-scale systems. This work is focused on improving the scalability of shared-memory multiprocessors by significantly reducing the size of the directory. We propose multilayer clustering as an effective approach to reduce the directory-entry width. Detailed evaluation for 64 processors shows that using this approach we can drastically reduce the memory overhead, while suffering a performance degradation very similar to previous compressed schemes (such as Coarse Vector). In addition, a novel two-level directory architecture is proposed in order to eliminate the penalty caused by these compressed directories. This organization consists of a small Full-Map first-level directory (which provides precise information for the most recently referenced lines) and a compressed second-level directory (which provides in-excess information). Results show that a system with this directory architecture can achieve the same performance as a multiprocessor with a big and non-scalable Full-Map directory, with a very significant reduction of the memory overhead.",
	address = "Nuevo Leon, Mexico",
	booktitle = "IEEE High-Performance Computer Architecture Symposium Proceedings",
	key = "Multiprocessing systems",
	keywords = "Computer architecture;Data storage equipment;Program processors;Storage allocation (computer);",
	note = "Multilayer clustering;Shared-memory multiprocessors;",
	pages = "97--106",
	title = "{N}ew scalable directory architecture for large-scale multiprocessors",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the impact of message packetization in networks of workstations with irregular topology. 2001, 3 - 10. URL BibTeX

@conference{ 6867161,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are becoming an increasingly popular alternative to parallel computers for those applications with high needs of resources such as memory capacity and input/output storage space, and also for small scale parallel computing. Usually, the software messaging layers in these systems become a bottleneck due to the overhead they introduce. Some proposals like FM and BIP considerably reduce this overhead by splitting long messages into several packets. These proposals have been shown to improve communication performance. However, the effect of message packetization on the network interconnects has not been analyzed yet. In this paper we examine the effect of message packetization from the point of view of the interconnection network in the context of bimodal traffic. Two different routing algorithms have been considered: up*/down* and minimal adaptive routing. Our study shows that when the up*/down* routing algorithm is used, message packetization dramatically increases latency and reduces throughput for both long and short messages. On the other hand, if minimal adaptive routing is used, short messages could benefit from message packetization, but at the cost of increasing latency for long messages. In any case, network throughput is considerably reduced",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Ninth Euromicro Workshop on Parallel and Distributed Processing",
	doi = "10.1109/EMPDP.2001.904960",
	keywords = "multiprocessor interconnection networks;network routing;performance evaluation;workstation clusters;",
	note = "message packetization;networks of workstations;irregular topology;resources;memory capacity;input/output storage space;software messaging layers;interconnection network;bimodal traffic;routing algorithms;minimal adaptive routing;latency;",
	pages = "3--10",
	title = "{O}n the impact of message packetization in networks of workstations with irregular topology",
	url = "http://dx.doi.org/10.1109/EMPDP.2001.904960",
	year = 2001
}

JC Sancho, Antonio Robles and Jose Duato. On the relative behavior of source and distributed routing in NOWs using up*/down* routing schemes. In K Klockner (ed.). NINTH EUROMICRO WORKSHOP ON PARALLEL AND DISTRIBUTED PROCESSING, PROCEEDINGS. 2001, 11-18. BibTeX

@conference{ isi:000166833400002,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are arranged as a switch-based network with irregular topology which makes routing and deadlock avoidance quite complicated. Current proposals use the up{*}/down{*} routing algorithm to remove cyclic dependencies between channels and avoid deadlock. Recently a simple and effective methodology to compute up{*}/down{*} routing tables has been proposed by us. The resulting up{*}/down{*} routing scheme increases the number of alternative paths between every pair of switches and allows most messages to follow minimal paths. Also, up{*}/down{*} routing is suitable to be implemented using source or distributed routing. Source routing provides a safer and lower cost implementation of up{*}/down{*} routing than that provided by distributed routing. However distributed routing may benefit from routing messages through alternative paths to reach their destination. In this paper we evaluate the performance of up{*}/down{*} routing when using two methodologies to compute routing tables, and when both source and distributed routing are used. Evaluation results show that it is not worth to implement up{*}/down{*} routing in a distributed way in a NOW environment, since its performance is very close to that achieved by implementing it with source routing when a traffic-balancing algorithm is used. Moreover it is shown that a greater improvement in performance can be achieved by modifying the method to compute up{*}/down{*} routing tables when source routing is used.",
	booktitle = "NINTH EUROMICRO WORKSHOP ON PARALLEL AND DISTRIBUTED PROCESSING, PROCEEDINGS",
	doi = "10.1109/EMPDP.2001.904962",
	editor = "Klockner, K.",
	isbn = "0-7695-0988-6",
	note = "9th Euromicro Workshop on Parallel and Distributed Processing, MANTOVA, ITALY, FEB 07-09, 2001",
	pages = "11--18",
	title = "{O}n the relative behavior of source and distributed routing in {NOW}s using up{*}/down{*} routing schemes",
	year = 2001
}

J C Sancho, Antonio Robles and Jose Duato. On the relative behavior of source and distributed routing in NOWs using Up*/Down* routing schemes. 2001, 11 - 18. URL BibTeX

@conference{ 6867162,
	author = "Sancho, J. C. and Robles, Antonio and Duato, Jose",
	abstract = "Networks of workstations (NOWs) are arranged as a switch-based network with irregular topology, which makes routing and deadlock avoidance quite complicated. Current proposals use the up*/down* routing algorithm to remove cyclic dependencies between channels and avoid deadlock. Recently, a simple and effective methodology to compute up*/down* routing tables has been proposed by us. The resulting up*/down* routing scheme increases the number of alternative paths between every pair of switches and allows most messages to follow minimal paths. Also, up*/down* routing is suitable to be implemented using source or distributed routing. Source routing provides a safer and lower cost implementation of up*/down* routing than that provided by distributed routing. However distributed routing may benefit from routing messages through alternative paths to reach their destination. In this paper we evaluate the performance of up*/down* routing when using two methodologies to compute routing tables, and when both source and distributed routing are used. Evaluation results show that it is not worth to implement up*/down* routing in a distributed way in a NOW environment, since its performance is very close to that achieved by implementing it with source routing when a traffic-balancing algorithm is used. Moreover it is shown that a greater improvement in performance can be achieved by modifying the method to compute up*/down* routing tables when source routing is used",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings Ninth Euromicro Workshop on Parallel and Distributed Processing",
	doi = "10.1109/EMPDP.2001.904962",
	keywords = "network routing;performance evaluation;system recovery;workstation clusters;",
	note = "distributed routing;NOWs;Up*/Down* routing schemes;networks of workstations;switch-based network;irregular topology;deadlock avoidance;cyclic dependencies;lower cost implementation;performance;",
	pages = "11--18",
	title = "{O}n the relative behavior of source and distributed routing in {NOW}s using {U}p*/{D}own* routing schemes",
	url = "http://dx.doi.org/10.1109/EMPDP.2001.904962",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the scalability of topologies for storage area networks in building environments. 2001, 332 - 335. URL BibTeX

@conference{ 7114065,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "Nowadays, the fast growth of data intensive applications is changing the way storage is devised. The traditional server-to-disk approach is being replaced by storage area networks (SANs), which are a separate network for storage, isolated from the messaging network and optimized for the movement of data between servers and storage devices (usually disks). We analyze the performance and cost scalability of a family of network topologies devised to be used in building environments. Performance simulation results combined with cost estimations have revealed that slight modifications in network topology can affect the overall scalability. In particular wraparound links connecting the lowest and highest floors in the building significantly affect the scalability of the network. Anyway, the use of this kind of links by itself does not provide the best solution. It is also necessary to have a good interconnection pattern in the backbone",
	address = "Los Alamitos, CA, USA",
	booktitle = "Proceedings IEEE International Symposium on Network Computing and Applications. NCA 2001",
	doi = "10.1109/NCA.2001.962549",
	keywords = "digital storage;local area networks;network topology;telecommunication network routing;",
	note = "storage area networks;building environments;data intensive applications;servers;storage devices;cost scalability;performance simulation;cost estimations;network topology;wraparound links;interconnection pattern;backbone;",
	pages = "332--335",
	title = "{O}n the scalability of topologies for storage area networks in building environments",
	url = "http://dx.doi.org/10.1109/NCA.2001.962549",
	year = 2001
}

Xavier Molero, Federico Silla, Vicente Santonja and Jose Duato. On the switch architecture for fibre channel storage area networks. 2001, 484 - 491. URL BibTeX

@conference{ 2001416673902,
	author = "Molero, Xavier and Silla, Federico and Santonja, Vicente and Duato, Jose",
	abstract = "The fast growth of data intensive applications has caused a change in the traditional storage model. The server-to-disk approach is being replaced by storage area networks (SANs), which enable storage to be externalized from servers, thus allowing storage devices to be shared among multiple servers. Nowadays, the majority of SANs use Fibre Channel. The standard for Fibre Channel defines several issues related to the switch interface, but does not make any suggestion about the internal switch architecture to be implemented by manufacturers. In this paper we analyze the key architectural switch characteristics for building Fibre Channel storage area networks. To do so, our starting point is the performance analysis of two different switch architectures, identifying their strongest and weakest points, and thus taking advantage of the best features from both of them. After this first analysis, we introduce several other features in the switch, concluding with a proposed architecture that doubles network throughput while reducing response delay.",
	address = "Kyongju, Korea, Republic of",
	booktitle = "Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS",
	doi = "10.1109/ICPADS.2001.934857",
	key = "Client server computer systems",
	keywords = "Computer architecture;Computer networks;Data storage equipment;Network protocols;",
	note = "Fiber channel;Storage area network;",
	pages = "484--491",
	title = "{O}n the switch architecture for fibre channel storage area networks",
	url = "http://dx.doi.org/10.1109/ICPADS.2001.934857",
	year = 2001
}

J Fernandez, J M Garcia and Jose Duato. Performance evaluation of real-time communication services on high-speed LANs under topology changes. 2001, 341 - 350. BibTeX

@conference{ 7307079,
	author = "Fernandez, J. and Garcia, J. M. and Duato, Jose",
	abstract = "Topology changes, such as switches being turned on/off, hot expansion, hot replacement or link re-mapping, are very likely to occur in NOWs and clusters. Moreover, topology changes are much more frequent than faults. However, their impact on real-time communications has not been considered a major problem up to now, mostly because they are not feasible in traditional environments, such as massive parallel processors (MPPs), which have fixed topologies. Topology changes are supported and handled by some current and future interconnects, such as Myrinet or Infiniband. Unfortunately, they do not include support for real-time communications in the presence of topology changes. In this paper, we evaluate a previously proposed protocol, called Dynamically Re-established Real-Time Channels (DRRTC) protocol, that provides topology change- and fault-tolerant real-time communication services on NOWs. We present and analyze the performance evaluation results when a single switch or a single link is deactivated/activated for different topologies and workloads. The simulation results suggest that topology change tolerance is only limited by the resources available to establish real-time channels as well as by the topology connectivity",
	address = "Berlin, Germany",
	booktitle = "High Performance Computing - HiPC 2001. 8th International Conference. Proceedings",
	keywords = "performance evaluation;protocols;workstation clusters;",
	note = "NOWs;clusters;protocol;Dynamically Re-established Real-Time Channels;DRRTC;fault-tolerant;topology change;real-time communication;topology connectivity;networks of workstations;",
	pages = "341--350",
	publisher = "Springer",
	series = "Lecture Notes in Computer Science",
	title = "{P}erformance evaluation of real-time communication services on high-speed {LAN}s under topology changes",
	volume = 2238,
	year = 2001
}

J M Orduna, Federico Silla and Jose Duato. Towards a communication-aware task scheduling strategy for heterogeneous systems. Computing and Informatics 20(3):245 - 267, 2001. BibTeX

@article{ 7109983,
	author = "Orduna, J. M. and Silla, Federico and Duato, Jose",
	abstract = "Many research activities have focused on the problem of task scheduling in heterogeneous systems from the computational point of view. However, a scheduling strategy should also take into account the communication requirements of the applications and the communication bandwidth offered by the network. Towards this end, we first propose a model of communication cost between network nodes. This model can be used to properly characterize the existing network resources. Second, we propose a criterion to measure the suitability of each allocation of network resources to each parallel application, according to the communication requirements. Third, we propose a scheduling technique based exclusively on this criterion that provides a near-optimal mapping of processes to processors according to the communication requirements. Evaluation results show that the use of this scheduling technique fully exploits the available network bandwidth, greatly improving network performance. Therefore, the proposed scheduling technique can be used in the design of communication-aware scheduling strategies for those situations where the communication requirements are the system performance bottleneck",
	address = "Slovakia",
	issn = "0232-0274",
	journal = "Computing and Informatics",
	keywords = "directed graphs;performance evaluation;processor scheduling;resource allocation;trees (mathematics);workstation clusters;",
	note = "communication-aware task scheduling strategy;heterogeneous systems;communication cost;network nodes;network resources;parallel application;near-optimal mapping;available network bandwidth;network performance;performance bottleneck;interconnection networks;cluster computing;",
	number = 3,
	pages = "245--267",
	title = "{T}owards a communication-aware task scheduling strategy for heterogeneous systems",
	volume = 20,
	year = 2001
}

F Buendia, P Diaz, Julio Sahuquillo, J V Benlloch, J A Gil and M Agusti. XEDU, a framework for developing XML-based didactic resources. In Euromicro Conference, 2001. Proceedings. 27th. 2001, 427 - 434. URL, DOI BibTeX

@conference{ 952484,
	author = "Buendia, F. and Diaz, P. and Sahuquillo, Julio and Benlloch, J. V. and Gil, J. A. and Agusti, M.",
	abstract = "Recent educational software applications use Web technologies like XML to improve teaching methods in distance learning environments. Though XML has already been used to implement a high number of didactic resources, specification methodologies to develop these resources are rarely applied. As a consequence, the reuse and maintenance of those resources becomes a difficult task. This paper emphasises the use of hypermedia models to deal with this problem. Hypermedia models have long considered to have a great potential to represent educational applications. The current work proposes the XEDU framework that works over the Labyrinth hypermedia model, to manage and organise didactic resources. The proposed framework provides a set of abstract didactic structures and the interface to associate them either to XML-based contents and other complex didactic resources",
	booktitle = "Euromicro Conference, 2001. Proceedings. 27th",
	doi = "10.1109/EURMIC.2001.952484",
	isbn = "0-7695-1236-4",
	keywords = "Labyrinth hypermedia model;Web technologies;XEDU;XML-based contents;XML-based didactic resources;abstract didactic structures;complex didactic resources;distance learning environments;educational software;framework;hypermedia models;specification methodologies",
	pages = "427--434",
	title = "{XEDU}, a framework for developing {XML}-based didactic resources",
	url = "http://dx.doi.org/10.1109/EURMIC.2001.952484",
	year = 2001
}