1. Carlos Reaño and Federico Silla. Redesigning the rCUDA communication layer for a better adaptation to the underlying hardware. Concurrency and computation 33(14), 2021. BibTeX

    @article{ReañoCarlos2021Rtrc,
    	abstract = "The use of Graphics Processing Units (GPUs) has become a very popular way to accelerate the execution of many applications. However, GPUs are not exempt from side effects. For instance, GPUs are expensive devices which additionally consume a non‐negligible amount of energy even when they are not performing any computation. Furthermore, most applications present low GPU utilization. To address these concerns, the use of GPU virtualization has been proposed. In particular, remote GPU virtualization is a promising technology that allows applications to transparently leverage GPUs installed in any node of the cluster. In this paper, the remote GPU virtualization mechanism is comparatively analyzed across three different generations of GPUs. The first contribution of this study is an analysis about how the performance of the remote GPU virtualization technique is impacted by the underlying hardware. To that end, the Tesla K20, Tesla K40, and Tesla P100 GPUs along with FDR and EDR InfiniBand fabrics are used in the study. The analysis is performed in the context of the rCUDA middleware. It is clearly shown that the GPU virtualization middleware requires a comprehensive design of its communication layer, which should be perfectly adapted to every hardware generation in order to avoid a reduction in performance. This is precisely the second contribution of this work, ie, redesigning the rCUDA communication layer in order to improve the management of the underlying hardware. Results show that it is possible to improve bandwidth up to 29.43\%, which translates into up to 4.81\% average less execution time in the performance of the analyzed applications.",
    	author = "Reaño, Carlos and Silla, Federico",
    	copyright = "2019 John Wiley \& Sons, Ltd.",
    	issn = "1532-0626",
    	journal = "Concurrency and Computation: Practice and Experience",
    	keywords = "CUDA ; GPGPU ; HPC ; infiniBand ; virtualization",
    	language = "eng",
    	number = 14,
    	publisher = "Wiley Subscription Services, Inc",
    	title = "Redesigning the {rCUDA} communication layer for a better adaptation to the underlying hardware",
    	volume = 33,
    	year = 2021
    }
    
  2. Sergio Iserte, Javier Prades, Carlos Reaño and Federico Silla. Improving the management efficiency of GPU workloads in data centers through GPU virtualization. Concurrency and computation 33(2), 2021. BibTeX

    @article{IserteSergio2021Itme,
    	abstract = "Graphics processing units (GPUs) are currently used in data centers to reduce the execution time of compute‐intensive applications. However, the use of GPUs presents several side effects, such as increased acquisition costs and larger space requirements. Furthermore, GPUs require a nonnegligible amount of energy even while idle. Additionally, GPU utilization is usually low for most applications. In a similar way to the use of virtual machines, using virtual GPUs may address the concerns associated with the use of these devices. In this regard, the remote GPU virtualization mechanism could be leveraged to share the GPUs present in the computing facility among the nodes of the cluster. This would increase overall GPU utilization, thus reducing the negative impact of the increased costs mentioned before. Reducing the amount of GPUs installed in the cluster could also be possible. However, in the same way as job schedulers map GPU resources to applications, virtual GPUs should also be scheduled before job execution. Nevertheless, current job schedulers are not able to deal with virtual GPUs. In this paper, we analyze the performance attained by a cluster using the remote Compute Unified Device Architecture middleware and a modified version of the Slurm scheduler, which is now able to assign remote GPUs to jobs. Results show that cluster throughput, measured as jobs completed per time unit, is doubled at the same time that the total energy consumption is reduced up to 40\%. GPU utilization is also increased.",
    	author = "Iserte, Sergio and Prades, Javier and Reaño, Carlos and Silla, Federico",
    	copyright = "2019 John Wiley \& Sons, Ltd.",
    	issn = "1532-0626",
    	journal = "Concurrency and Computation: Practice and Experience",
    	keywords = "CUDA ; data centers ; GPU ; InfiniBand ; rCUDA ; Slurm ; Virtualization",
    	language = "eng",
    	number = 2,
    	publisher = "Wiley Subscription Services, Inc",
    	title = "Improving the management efficiency of {GPU} workloads in data centers through {GPU} virtualization",
    	volume = 33,
    	year = 2021
    }
    
  3. Carlos Reaño, Federico Silla and Blesson Varghese. PII: S0743-7315(20)30386-5. Journal of parallel and distributed computing 147:268-269, 2021. BibTeX

    @article{ReañoCarlos2021PS,
    	author = "Reaño, Carlos and Silla, Federico and Varghese, Blesson",
    	copyright = "2020 Elsevier Inc.",
    	issn = "0743-7315",
    	journal = "Journal of Parallel and Distributed Computing",
    	language = "eng",
    	pages = "268--269",
    	publisher = "Elsevier Inc",
    	title = "{PII: S0743-7315(20)30386-5}",
    	volume = 147,
    	year = 2021
    }
    
  4. Carlos Reaño, Federico Silla and Blesson Varghese. Accelerator virtualization. Concurrency and computation, 2021. BibTeX

    @article{ReañoCarlos2021Av,
    	author = "Reaño, Carlos and Silla, Federico and Varghese, Blesson",
    	issn = "1532-0626",
    	journal = "Concurrency and Computation: Practice and Experience",
    	language = "eng",
    	title = "Accelerator virtualization",
    	year = 2021
    }
    
  5. Daniel Hernandez, Juan-Carlos Cano, Federico Silla, Carlos T Calafate and Jose M Cecilia. AI-enabled autonomous drones for fast climate change crisis assessment. IEEE internet of things journal, pages 1-1, 2021. BibTeX

    @article{HernandezDaniel2021Aadf,
    	abstract = "Climate change is one of the greatest challenges for modern societies. Its consequences, often associated with extreme events, have dramatic results worldwide. New synergies between different disciplines including Artificial Intelligence (AI), Internet of Things (IoT), and edge computing can lead to radically new approaches for the real-time tracking of natural disasters that are also designed to reduce the environmental footprint. In this article, we propose an AI-based pipeline for processing natural disaster images taken from drones. The purpose of this pipeline is to reduce the number of images to be processed by the first responders of the natural disaster. It consists of three main stages, (1) a lightweight auto-encoder based on deep learning, (2) a dimensionality reduction using the t-SNE algorithm and (3) a fuzzy clustering procedure. This pipeline is evaluated on several edge computing platforms with low-power accelerators to assess the design of intelligent autonomous drones to provide this service in real time. Our experimental evaluation focuses on flooding, showing that the amount of information to be processed is substantially reduced whereas edge computing platforms with low-power GPUs are placed as a compelling alternative for processing these heavy computational workloads, obtaining a performance loss of only 2.3x compared to its cloud counterpart version, running both the training and inference steps.",
    	author = "Hernandez, Daniel and Cano, Juan-Carlos and Silla, Federico and Calafate, Carlos T and Cecilia, Jose M",
    	issn = "2327-4662",
    	journal = "IEEE Internet of Things Journal",
    	keywords = "Artificial Vision ; Climate Change ; Cloud computing ; Clustering algorithms ; Deep Learning ; Drones ; Edge computing ; Internet of Things ; Performance evaluation ; Pipelines ; Sustainable ICT ; UAVs",
    	language = "eng",
    	pages = "1--1",
    	publisher = "IEEE",
    	title = "{AI}-enabled autonomous drones for fast climate change crisis assessment",
    	year = 2021
    }
    
  6. Javier Prades Gasulla. Improving Performance and Energy Efficiency of Heterogeneous Systems with rCUDA. 2021. BibTeX

    @phdthesis{PradesGasullaJavier2021IPaE,
    	abstract = "[ES] En la última década la utilización de la GPGPU (General Purpose computing in Graphics Processing Units; Computación de Propósito General en Unidades de Procesamiento Gráfico) se ha vuelto tremendamente popular en los centros de datos de todo el mundo. Las GPUs (Graphics Processing Units; Unidades de Procesamiento Gráfico) se han establecido como elementos aceleradores de cómputo que son usados junto a las CPUs formando sistemas heterogéneos. La naturaleza masivamente paralela de las GPUs, destinadas tradicionalmente al cómputo de gráficos, permite realizar operaciones numéricas con matrices de datos a gran velocidad debido al gran número de núcleos que integran y al gran ancho de banda de acceso a memoria que poseen. En consecuencia, aplicaciones de todo tipo de campos, tales como química, física, ingeniería, inteligencia artificial, ciencia de materiales, etc. que presentan este tipo de patrones de cómputo se ven beneficiadas, reduciendo drásticamente su tiempo de ejecución. En general, el uso de la aceleración del cómputo en GPUs ha significado un paso adelante y una revolución. Sin embargo, no está exento de problemas, tales como problemas de eficiencia energética, baja utilización de las GPUs, altos costes de adquisición y mantenimiento, etc. En esta tesis pretendemos analizar las principales carencias que presentan estos sistemas heterogéneos y proponer soluciones basadas en el uso de la virtualización remota de GPUs. Para ello hemos utilizado la herramienta rCUDA, desarrollada en la Universitat Politècnica de València, ya que multitud de publicaciones la avalan como el framework de virtualización remota de GPUs más avanzado de la actualidad. Los resutados obtenidos en esta tesis muestran que el uso de rCUDA en entornos de Cloud Computing incrementa el grado de libertad del sistema, ya que permite crear instancias virtuales de las GPUs físicas totalmente a medida de las necesidades de cada una de las máquinas virtuales. 
En entornos HPC (High Performance Computing; Computación de Altas Prestaciones), rCUDA también proporciona un mayor grado de flexibilidad de uso de las GPUs de todo el clúster de cómputo, ya que permite desacoplar totalmente la parte CPU de la parte GPU de las aplicaciones. Además, las GPUs pueden estar en cualquier nodo del clúster, independientemente del nodo en el que se está ejecutando la parte CPU de la aplicación. En general, tanto para Cloud Computing como en el caso de HPC, este mayor grado de flexibilidad se traduce en un aumento hasta 2x de la productividad de todo el sistema al mismo tiempo que se reduce el consumo energético en un 15\%. Finalmente, también hemos desarrollado un mecanismo de migración de trabajos de la parte GPU de las aplicaciones que ha sido integrado dentro del framework rCUDA. Este mecanismo de migración ha sido evaluado y los resultados muestran claramente que, a cambio de una pequeña sobrecarga, alrededor de 400 milisegundos, en el tiempo de ejecución de las aplicaciones, es una potente herramienta con la que, de nuevo, aumentar la productividad y reducir el gasto energético del sistema. En resumen, en esta tesis se analizan los principales problemas derivados del uso de las GPUs como aceleradores de cómputo, tanto en entornos HPC como de Cloud Computing, y se demuestra cómo a través del uso del framework rCUDA, estos problemas pueden solucionarse. Además se desarrolla un potente mecanismo de migración de trabajos GPU, que integrado dentro del framework rCUDA, se convierte en una herramienta clave para los futuros planificadores de trabajos en clusters heterogéneos. [CA] En l'última dècada la utilització de la GPGPU(General Purpose computing in Graphics Processing Units; Computació de Propòsit General en Unitats de Processament Gràfic) s'ha tornat extremadament popular en els centres de dades de tot el món. 
Les GPUs (Graphics Processing Units; Unitats de Processament Gràfic) s'han establert com a elements acceleradors de còmput que s'utilitzen al costat de les CPUs formant sistemes heterogenis. La naturalesa massivament paral·lela de les GPUs, destinades tradicionalment al còmput de gràfics, permet realitzar operacions numèriques amb matrius de dades a gran velocitat degut al gran nombre de nuclis que integren i al gran ample de banda d'accés a memòria que posseeixen. En conseqüència, les aplicacions de tot tipus de camps, com ara química, física, enginyeria, intel·ligència artificial, ciència de materials, etc. que presenten aquest tipus de patrons de còmput es veuen beneficiades reduint dràsticament el seu temps d'execució. En general, l'ús de l'acceleració del còmput en GPUs ha significat un pas endavant i una revolució, però no està exempt de problemes, com ara poden ser problemes d'eficiència energètica, baixa utilització de les GPUs, alts costos d'adquisició i manteniment, etc. En aquesta tesi pretenem analitzar les principals mancances que presenten aquests sistemes heterogenis i proposar solucions basades en l'ús de la virtualització remota de GPUs. Per a això hem utilitzat l'eina rCUDA, desenvolupada a la Universitat Politècnica de València, ja que multitud de publicacions l'avalen com el framework de virtualització remota de GPUs més avançat de l'actualitat. Els resultats obtinguts en aquesta tesi mostren que l'ús de rCUDA en entorns de Cloud Computing incrementa el grau de llibertat del sistema, ja que permet crear instàncies virtuals de les GPUs físiques totalment a mida de les necessitats de cadascuna de les màquines virtuals. En entorns HPC (High Performance Computing; Computació d'Altes Prestacions), rCUDA també proporciona un major grau de flexibilitat en l'ús de les GPUs de tot el clúster de còmput, ja que permet desacoblar totalment la part CPU de la part GPU de les aplicacions. 
A més, les GPUs poden estar en qualsevol node del clúster, sense importar el node en el qual s'està executant la part CPU de l'aplicació. En general, tant per a Cloud Computing com en el cas del HPC, aquest major grau de flexibilitat es tradueix en un augment fins 2x de la productivitat de tot el sistema al mateix temps que es redueix el consum energètic en aproximadament un 15\%. Finalment, també hem desenvolupat un mecanisme de migració de treballs de la part GPU de les aplicacions que ha estat integrat dins del framework rCUDA. Aquest mecanisme de migració ha estat avaluat i els resultats mostren clarament que, a canvi d'una petita sobrecàrrega, al voltant de 400 mil·lisegons, en el temps d'execució de les aplicacions, és una potent eina amb la qual, de nou, augmentar la productivitat i reduir la despesa energètica de sistema. En resum, en aquesta tesi s'analitzen els principals problemes derivats de l'ús de les GPUs com acceleradors de còmput, tant en entorns HPC com de Cloud Computing, i es demostra com a través de l'ús del framework rCUDA, aquests problemes poden solucionar-se. A més es desenvolupa un potent mecanisme de migració de treballs GPU, que integrat dins del framework rCUDA, esdevé una eina clau per als futurs planificadors de treballs en clústers heterogenis. [EN] In the last decade the use of GPGPU (General Purpose computing in Graphics Processing Units) has become extremely popular in data centers around the world. GPUs (Graphics Processing Units) have been established as computational accelerators that are used alongside CPUs to form heterogeneous systems. The massively parallel nature of GPUs, traditionally intended for graphics computing, allows to perform numerical operations with data arrays at high speed. This is achieved thanks to the large number of cores GPUs integrate and the large bandwidth of memory access. 
Consequently, applications of all kinds of fields, such as chemistry, physics, engineering, artificial intelligence, materials science, and so on, presenting this type of computational patterns are benefited by drastically reducing their execution time. In general, the use of computing acceleration provided by GPUs has meant a step forward and a revolution, but it is not without problems, such as energy efficiency problems, low utilization of GPUs, high acquisition and maintenance costs, etc. In this PhD thesis we aim to analyze the main shortcomings of these heterogeneous systems and propose solutions based on the use of remote GPU virtualization. To that end, we have used the rCUDA middleware, developed at Universitat Politècnica de València. Many publications support rCUDA as the most advanced remote GPU virtualization framework nowadays. The results obtained in this PhD thesis show that the use of rCUDA in Cloud Computing environments increases the degree of freedom of the system, as it allows to create virtual instances of the physical GPUs fully tailored to the needs of each of the virtual machines. In HPC (High Performance Computing) environments, rCUDA also provides a greater degree of flexibility in the use of GPUs throughout the computing cluster, as it allows the CPU part to be completely decoupled from the GPU part of the applications. In addition, GPUs can be on any node in the cluster, regardless of the node on which the CPU part of the application is running. In general, both for Cloud Computing and in the case of HPC, this greater degree of flexibility translates into an up to 2x increase in system-wide throughput while reducing energy consumption by approximately 15\%. Finally, we have also developed a job migration mechanism for the GPU part of applications that has been integrated within the rCUDA middleware. 
This migration mechanism has been evaluated and the results clearly show that, in exchange for a small overhead of about 400 milliseconds in the execution time of the applications, it is a powerful tool with which, again, we can increase productivity and reduce energy foot print of the computing system. In summary, this PhD thesis analyzes the main problems arising from the use of GPUs as computing accelerators, both in HPC and Cloud Computing environments, and demonstrates how thanks to the use of the r",
    	author = "Prades Gasulla, Javier",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Cloud Computing ; Computación de altas prestaciones ; Energy Efficiency ; GPGPU ; Graphics processing units (GPU) ; Heterogeneous systems ; High Performance Computing ; HPC ; rCUDA ; Unidades de procesamiento gráfico",
    	language = "eng",
    	publisher = "Universitat Politècnica de València",
    	school = "Universitat Politècnica de València",
    	title = "Improving Performance and Energy Efficiency of Heterogeneous Systems with {rCUDA}",
    	year = 2021
    }
    
  7. Adrián Castelló, Enrique S Quintana-Ortí and José Duato. Accelerating distributed deep neural network training with pipelined MPI allreduce. Cluster computing 24(4):3797-3813, 2021. BibTeX

    @article{CastellóAdrián2021Addn,
    	abstract = "TensorFlow (TF) is usually combined with the Horovod (HVD) workload distribution package to obtain a parallel tool to train deep neural network on clusters of computers. HVD in turn utilizes a blocking Allreduce primitive to share information among processes, combined with a communication thread to overlap communication with computation. In this work, we perform a thorough experimental analysis to expose (1) the importance of selecting the best algorithm in MPI libraries to realize the Allreduce operation; and (2) the performance acceleration that can be attained when replacing a blocking Allreduce with its non-blocking counterpart (while maintaining the blocking behaviour via the appropriate synchronization mechanism). Furthermore, (3) we explore the benefits of applying pipelining to the communication exchange, demonstrating that these improvements carry over to distributed training via TF+HVD. Finally, (4) we show that pipelining can also boost performance for applications that make heavy use of other collectives, such as Broadcast and Reduce-Scatter.",
    	author = "Castelló, Adrián and Quintana-Ortí, Enrique S and Duato, José",
    	address = "New York",
    	copyright = "The Author(s) 2021. corrected publication 2021",
    	issn = "1386-7857",
    	journal = "Cluster Computing",
    	keywords = "Algorithms ; Analysis ; Article ; Artificial neural networks ; Communication ; Computer Communication Networks ; Computer Science ; Neural networks ; Operating Systems ; Processor Architectures ; Synchronism ; Training ; Usage",
    	language = "eng",
    	number = 4,
    	pages = "3797--3813",
    	publisher = "Springer US",
    	title = "Accelerating distributed deep neural network training with pipelined {MPI} allreduce",
    	volume = 24,
    	year = 2021
    }
    
  8. Cristina Olmedilla, Jesus Escudero-Sahuquillo, Pedro Javier Garcia-Garcia, Francisco Alfaro-Cortes, Jose L Sanchez, Francisco J Quiles, Wenhao Sun, Xiang Yu, Yonghui Xu and Jose Duato. DVL-Lossy: Isolating Congesting Flows to Optimize Packet Dropping in Lossy Data-Center Networks. IEEE MICRO 41(1):37-44, 2021. BibTeX

    @article{OlmedillaCristina2021DICF,
    	abstract = "The performance of lossy data-center networks (DCNs) may degrade due to packet dropping (and possible retransmission) under congestion. In this article, we propose and evaluate a solution to deal with congestion in lossy DCNs, based on the same approach as the dynamic virtual lanes technique, previously proposed for lossless DCNs. This approach consists of isolating congesting flows in special queues, so that they do not share queues with noncongesting ones. This reduces the probability of standard queues becoming congested, thus reducing the dropping (and retransmission) of noncongesting packets and improving network performance. The experiment results confirm that these benefits are achieved by adding just a single special queue per switch port.",
    	author = "Olmedilla, Cristina and Escudero-Sahuquillo, Jesus and Garcia-Garcia, Pedro Javier and Alfaro-Cortes, Francisco and Sanchez, Jose L and Quiles, Francisco J and Sun, Wenhao and Yu, Xiang and Xu, Yonghui and Duato, José",
    	issn = "0272-1732",
    	journal = "IEEE Micro",
    	keywords = "Bandwidth ; Congestion Management ; Delays ; Dynamic Virtual Lanes ; Lossy Data-center Networks ; Packet loss ; Standards organizations ; Switches ; Topology",
    	language = "eng",
    	number = 1,
    	pages = "37--44",
    	publisher = "IEEE",
    	title = "{DVL-Lossy}: Isolating Congesting Flows to Optimize Packet Dropping in Lossy Data-Center Networks",
    	volume = 41,
    	year = 2021
    }
    
  9. Adrian Castello, Mar Catalan, Manuel F Dolz, Jose I Mestre, Enrique S Quintana-Orti and Jose Duato. Performance Modeling for Distributed Training of Convolutional Neural Networks. In 2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP). 2021, 99-108. BibTeX

    @inproceedings{CastelloAdrian2021PMfD,
    	abstract = "We perform a theoretical analysis comparing the scalability of data versus model parallelism, applied to the distributed training of deep convolutional neural networks (CNNs), along five axes: batch size, node (floating-point) arithmetic performance, node memory bandwidth, network link bandwidth, and cluster dimension. Our study relies on analytical performance models that can be configured to reproduce the components and organization of the CNN model as well as the hardware configuration of the target distributed platform. In addition, we provide evidence of the accuracy of the analytical models by performing a validation against a Python library for distributed deep learning training.",
    	author = "Castelló, Adrián and Catalan, Mar and Dolz, Manuel F and Mestre, Jose I and Quintana-Ortí, Enrique S and Duato, José",
    	booktitle = "2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)",
    	isbn = 9781665414555,
    	issn = "2377-5750",
    	keywords = "analytical modeling ; Analytical models ; Bandwidth ; clusters ; Deep neural networks (DNNs) ; distributed training ; Neural networks ; Organizations ; Parallel processing ; Scalability ; Training",
    	language = "eng",
    	pages = "99--108",
    	publisher = "IEEE",
    	title = "Performance Modeling for Distributed Training of Convolutional Neural Networks",
    	year = 2021
    }
    
  10. Adrian Castello, Mar Catalan, Manuel F Dolz, Jose I Mestre, Enrique S Quintana-Orti and Jose Duato. Evaluation of MPI Allreduce for Distributed Training of Convolutional Neural Networks. In 2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP). 2021, 109-116. BibTeX

    @inproceedings{CastelloAdrian2021EoMA,
    	abstract = "Training deep neural networks is a costly procedure, often performed via sophisticated deep learning frameworks on clusters of computers. As faster processor technologies are integrated into these cluster facilities (e.g., NVIDIA's graphics accelerators or Google's tensor processing units), the communication component of the training process rapidly becomes a performance bottleneck. In this paper, we offer a complete analysis of the key collective communication primitive for the distributed data-parallel training of convolutional network networks (CNNs) focused on three relevant instances of the Message Passing Interface (MPI): MPICH, OpenMPI, and IntelMPI. In addition, our experimental evaluation is extended to expose the practical impact of this collective primitive when the training is performed using TensorFlow+ Horovod on a 16-node cluster. Finally, the theoretical analysis is further refined to a number of accelerated cluster configurations that are emulated by adjusting the communication-arithmetic ratio of the training process.",
    	author = "Castelló, Adrián and Catalan, Mar and Dolz, Manuel F and Mestre, Jose I and Quintana-Ortí, Enrique S and Duato, José",
    	booktitle = "2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)",
    	isbn = 9781665414555,
    	issn = "2377-5750",
    	keywords = "Allreduce ; collective communication primitives ; Convolutional neural networks ; Deep learning ; distributed training ; Graphics ; Message passing ; Message Passing Interface (MPI) ; Neural networks ; Tensors ; Training",
    	language = "eng",
    	pages = "109--116",
    	publisher = "IEEE",
    	title = "Evaluation of {MPI} Allreduce for Distributed Training of Convolutional Neural Networks",
    	year = 2021
    }
    
  11. Li Shen, Wenhao Sun, Xiang Yu and José Duato. Packet Control Method, Flow Table Update Method, and Node Device. 2021. BibTeX

    @misc{ShenLi2021PCMF,
    	author   = {Shen, Li and Sun, Wenhao and Yu, Xiang and Duato, José},
    	title    = {Packet Control Method, Flow Table Update Method, and Node Device},
    	year     = 2021,
    	language = {eng},
    	keywords = {ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHICCOMMUNICATION},
    	abstract = {A packet control method, a flow table update method, and a node device including a first queue and a second queue, where the method includes: obtaining, by the node device, a first packet; determining, by the node device, that a data flow to which the first packet belongs is marked as an isolated flow; and if the first queue and/or the second queue meet and/or meets a first preset condition, controlling, by the node device, the first packet to enter the first queue and wait to be scheduled; or if the first queue and/or the second queue meet and/or meets a second preset condition, controlling, by the node device, the first packet to enter the second queue and wait to be scheduled.}
    }
    
  12. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. PACKET CONTROL METHOD AND NODE DEVICE. 2021. BibTeX

    @misc{YUXiang2021PCMA,
    	abstract = "The present invention discloses a packet control method and a node device, to improve reliability of a data flow in a transmission process. The method includes: After receiving a pause frame, a first node automatically applies, based on adjustment information that is of a send queue of a data flow and that is recorded in a state record set, the pause frame to all queues associated in an adjustment process of the send queue of the data flow. In this way, a packet loss problem in a data transmission process can be avoided without adjusting an XOFF/XON threshold of a receive queue and without increasing a quantity of pause frames in a network system, thereby improving reliability of the data flow in the transmission process.",
    	author = "Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li",
    	keywords = "ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHICCOMMUNICATION",
    	language = "eng",
    	title = "Packet Control Method and Node Device",
    	year = 2021
    }
    
  13. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. MESSAGE CONTROL METHOD AND NODE DEVICE. 2021. BibTeX

    @misc{YUXiang2021MCMA,
    	abstract = "The present invention discloses a packet control method and a node device, to improve reliability of a data flow in a transmission process. The method includes: After receiving a pause frame, a first node automatically applies, based on adjustment information that is of a send queue of a data flow and that is recorded in a state record set, the pause frame to all queues associated in an adjustment process of the send queue of the data flow. In this way, a packet loss problem in a data transmission process can be avoided without adjusting an XOFF/XON threshold of a receive queue and without increasing a quantity of pause frames in a network system, thereby improving reliability of the data flow in the transmission process.",
    	author = "Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li",
    	keywords = "ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHICCOMMUNICATION",
    	language = "eng ; fre ; ger",
    	title = "Message Control Method and Node Device",
    	year = 2021
    }
    
  14. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. MESSAGE CONTROL METHOD, FLOW TABLE UPDATING METHOD, AND NODE DEVICE. 2021. BibTeX

    @misc{YUXiang2021MCMF,
    	abstract = "A packet control method, a flow table update method, and a node device are provided. The node device includes a first queue and a second queue. The method includes: obtaining, by the node device, a first packet; determining, by the node device, that a data flow to which the first packet belongs is marked as an isolated flow; and if the first queue and/or the second queue meet and/or meets a first preset condition, controlling, by the node device, the first packet to enter the first queue and wait to be scheduled, or if the first queue and/or the second queue meet and/or meets a second preset condition, controlling, by the node device, the first packet to enter the second queue and wait to be scheduled. In this way, the node device can relatively flexibly control a packet of the data flow, marked as an isolated flow, to enter a queue.",
    	author = "Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li",
    	keywords = "ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHICCOMMUNICATION",
    	language = "eng ; fre ; ger",
    	title = "Message Control Method, Flow Table Updating Method, and Node Device",
    	year = 2021
    }
    
  15. Tomas Picornell, Jose Flich, Carles Hernandez and Jose Duato. Enforcing Predictability of Many-Cores With DCFNoC. IEEE transactions on computers 70(2):270-283, 2021. BibTeX

    @article{PicornellTomas2021EPoM,
    	abstract = {The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this article we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a $4\times 4$ setting by a factor of $3.7\times$ when interference traffic is injected. For a $8\times 8$ network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79 percent over a standard wormhole implementation.},
    	author = "Picornell, Tomas and Flich, Jose and Hernandez, Carles and Duato, José",
    	issn = "0018-9340",
    	journal = "IEEE Transactions on Computers",
    	keywords = "Delays ; Electronic mail ; Interference ; MPSoCs ; Multiprocessor interconnection ; Real-time systems ; Routing ; safety-critical systems ; Software ; Time division multiplexing ; time division multiplexing (TDM) ; time predictable network",
    	language = "eng",
    	number = 2,
    	pages = "270--283",
    	publisher = "IEEE",
    	title = "Enforcing Predictability of Many-Cores With {DCFNoC}",
    	volume = 70,
    	year = 2021
    }
    
  16. Giovanni Agosta, William Fornaciari, David Atienza, Ramon Canal, Alessandro Cilardo, José Flich Cardo, Carles Hernandez Luz, Michal Kulczewski, Giuseppe Massari, Rafael Tornero Gavilá and Marina Zapater. The RECIPE approach to challenges in deeply heterogeneous high performance systems. Microprocessors and microsystems 77:103185, 2020. BibTeX

    @article{AgostaGiovanni2020TRat,
    	abstract = "RECIPE (REliable power and time-ConstraInts-aware Predictive management of heterogeneous Exascale systems) is a recently started project funded within the H2020 FETHPC programme, which is expressly targeted at exploring new High-Performance Computing (HPC) technologies. RECIPE aims at introducing a hierarchical runtime resource management infrastructure to optimize energy efficiency and minimize the occurrence of thermal hotspots, while enforcing the time constraints imposed by the applications and ensuring reliability for both time-critical and throughput-oriented computation that run on deeply heterogeneous accelerator-based systems. This paper presents a detailed overview of RECIPE, identifying the fundamental challenges as well as the key innovations addressed by the project. In particular, the need for predictive reliability approaches to maximizing hardware lifetime and guarantee application performance is identified as the key concern for RECIPE. We address it through hierarchical resource management of the heterogeneous architectural components of the system, driven by estimates of the application latency and hardware reliability obtained respectively through timing analysis and modeling thermal properties and mean-time-to-failure of subsystems. We show the impact of prediction accuracy on the overheads imposed by the checkpointing policy, as well as a possible application to a weather forecasting use case.",
    	author = "Agosta, Giovanni and Fornaciari, William and Atienza, David and Canal, Ramon and Cilardo, Alessandro and Flich Cardo, José and Hernandez Luz, Carles and Kulczewski, Michal and Massari, Giuseppe and Tornero Gavilá, Rafael and Zapater, Marina",
    	copyright = "2020 Elsevier B.V.",
    	issn = "0141-9331",
    	journal = "Microprocessors and microsystems",
    	keywords = "Computer Science - Distributed, Parallel, and Cluster Computing ; Heterogeneous computing ; HPC ; Run-time management",
    	language = "eng",
    	pages = "103185",
    	publisher = "Elsevier B.V.",
    	title = "The {RECIPE} approach to challenges in deeply heterogeneous high performance systems",
    	volume = 77,
    	year = 2020
    }
    
  17. Juan-José Crespo, José L Sánchez, Francisco J Alfaro-Cortés, José Flich and José Duato. UPR: deadlock-free dynamic network reconfiguration by exploiting channel dependency graph compatibility. The Journal of supercomputing 77(11):12826-12856, 2021. BibTeX

    @article{CrespoJuan-José2021Uddn,
    	abstract = "Deadlock-free dynamic network reconfiguration process is usually studied from the routing algorithm restrictions and resource reservation perspective. The dynamic nature yielded by the transition process from one routing function to another is often managed by restricting resource usage in a static predefined manner, which often limits the supported routing algorithms and/or inactive link patterns, or either requires additional resources such as virtual channels. Exploiting compatibility between routing functions by exploring their associated channel dependency graphs (CDG) leads to a better reconfiguration process given its dynamic nature. In this paper, we propose a new dynamic reconfiguration process called Upstream Progressive Reconfiguration (UPR). Our algorithm progressively performs dependency addition/removal in a per channel basis relying on the information provided by the CDG, while the reconfiguration process takes place. This gives us the opportunity to foresee compatible scenarios where both routing functions coexist, reducing the needed amount of resource drainage as well as packet injection halting.",
    	author = "Crespo, Juan-José and Sánchez, José L and Alfaro-Cortés, Francisco J and Flich, José and Duato, José",
    	address = "New York",
    	copyright = "The Author(s), under exclusive licence to Springer Science+Business Media, LLC, part of Springer Nature 2021",
    	issn = "0920-8542",
    	journal = "The Journal of supercomputing",
    	keywords = "Algorithms ; Article ; Compatibility ; Compilers ; Computer Science ; Computer Science - Networking and Internet Architecture ; general ; Interpreters ; Processor Architectures ; Programming Languages ; Reconfiguration",
    	language = "eng",
    	number = 11,
    	pages = "12826--12856",
    	publisher = "Springer US",
    	title = "{UPR}: deadlock-free dynamic network reconfiguration by exploiting channel dependency graph compatibility",
    	volume = 77,
    	year = 2021
    }
    
  18. Tomás Picornell-Sanjuan, José Flich Cardo, Carles Hernández Luz and José Francisco Duato Marín. Enforcing Predictability of Many-cores with DCFNoC. IEEE transactions on computers 70(2):270-283, 2021. BibTeX

    @article{Picornell-SanjuanTomás2021EPoM,
    	abstract = "© 2021 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works. [EN] The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this paper we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a 4 × 4 setting by a factor of 3.7× when interference traffic is injected. For a 8 × 8 network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79% over a standard wormhole implementation. 
This work has been supported by MINECO under Grant BES-2016-076885, by MINECO and funds from the European ERDF under Grant TIN2015-66972-C05-1-R and Grant RTI2018-098156-B-C51, and by the EC H2020 RECIPE project under Grant 801137. Picornell-Sanjuan, T.; Flich Cardo, J.; Hernández Luz, C.; Duato Marín, JF. (2021). Enforcing Predictability of Many-cores with DCFNoC. IEEE Transactions on Computers. 70(2):270-283. https://doi.org/10.1109/TC.2020.2987797",
    	author = "Picornell-Sanjuan, Tomás and Flich Cardo, José and Hernández Luz, Carles and Duato Marín, José Francisco",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	doi = "10.1109/TC.2020.2987797",
    	journal = "IEEE transactions on computers",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; MPSoCs ; Real-time systems ; Safety-critical systems ; Time division multiplexing (TDM) ; Time predictable network",
    	language = "eng",
    	number = 2,
    	pages = "270--283",
    	publisher = "Institute of Electrical and Electronics Engineers",
    	title = "Enforcing Predictability of Many-cores with {DCFNoC}",
    	volume = 70,
    	year = 2021
    }
    
  19. Brian Miguel Mcmullen García. Deepwise Separable Convolution Support in Neural Network Platform. 2021. BibTeX

    @misc{McmullenGarcíaBrianMiguel2021DSCS,
    	abstract = {[EN] In the last few years, Artificial intelligence (AI), has become an essential element of many technological fields. While AI has been developing on the level of algorithms, processing architectures have also been developing to better support AI. To obtain a better understanding of the implications of these architectures on AI algorithms, it is indispensable to use the new tools that allow the appropriate exploration, and consequently, the development of optimum algorithms and architectures adapted to the particular needs of the given problem to be solved by AI. This project will develop all the support necessary to enable the use of the Deepwise Separable (DWS) convolution on a training and inference platform for AI. The convolution will be implemented on a system based on an FPGA, an Alveo by Xilinx, using High-Level Synthesis. To demonstrate the advantages that our implementation of the convolution provides us, we will compare our implementations with that of a direct convolution. [ES] En los últimos años la Inteligencia Artificial (AI) se está convirtiendo en un elemento imprescindible en múltiples ámbitos tecnológicos. Al mismo tiempo que la IA se está desarrollando a nivel de algoritmos, también las arquitecturas de procesamiento se están adaptando para un mejor soporte de la IA. Para un mejor conocimiento de las implicaciones de las arquitecturas con los algoritmos de IA se hace imprescindible el uso de nuevas herramientas que permitan una exploración adecuada y, por consiguiente, un desarrollo de algoritmos y arquitecturas óptimos y adaptados a las necesidades particulares del problema a resolver por la IA. En este proyecto se desarrollará todo el soporte para habilitar el uso de la convolución DeepWise Separable en una plataforma de entrenamiento e inferencia de procesos de IA. La convolución se implementará en un sistema basado en una FPGA, la ALVEO de Xilinx utilizando High-Level Synthesis. 
Para demostrar las ventajas que nos proporciona nuestra implementación de la convolución, compararemos nuestra implementación con la implementación de una convolución directa. [CA] En els últims anys, la intel·ligència artificial (IA) s’està convertint en un element imprescindible en múltiples àmbits tecnològics. Al mateix temps que la IA s’està desenvolupant a nivell d’algorismes, també les arquitectures de processament s’estan adaptant per a un millor suport de la IA. Per millorar coneixement de les implicacions de les arquitectures amb els algoritmes de IA es fa imprescindible l’ús de noves eines que permetin una exploració adecuada i, per tant, un desenvolupament d’algorismes i arquitectures òptimes i adaptats a les necessitats particulars del problema a resoldre per la IA. En aquest projecte es desenvoluparà el suport necessari per a permetre l’us de la convolució "DeepWise Separable" en una plataforma d’entrenament e inferència de procesos de IA. La convolució s’implementarà en un sistema basat en la FPGA ALVEO de Xilinx emprant High-level sintesis. Per a demostrar les ventajes que ens proporciona la nostra implementació de la convolució, compararem la nostra implementació amb una implementació de la convolució directa. Mcmullen García, BM. (2021). Deepwise Separable Convolution Support in Neural Network Platform. Universitat Politècnica de València. http://hdl.handle.net/10251/171766},
    	author = "Mcmullen García, Brian Miguel",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	keywords = "Architectures ; ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Arquitecturas ; Artificial intelligence ; Convoluciones ; Convolution ; Entrenamiento ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Inference ; Inferencia ; Inteligencia artificial ; Training",
    	language = "eng",
    	publisher = "Universitat Politècnica de València",
    	title = "Deepwise Separable Convolution Support in Neural Network Platform",
    	url = "http://hdl.handle.net/10251/171766",
    	year = 2021
    }
    
  20. Roberto Díaz-Cano Lozano. Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma Jetson Xavier. 2021. BibTeX

    @misc{Díaz-CanoLozanoRoberto2021Ddpd,
    	abstract = "[CA] La intel·ligència artificial (IA) s’està convertint en un element imprescindible en diferents àmbits de la informàtica. Al mateix temps que la IA s’està desenvolupant a escala d’algoritmes, també les arquitectures de processament s’hi estan adaptant per donar un millor suport. Per aquest motiu en el present treball es desenvolupa un suport en aritmètica en coma flotant de 16 bits sobre una plataforma d’entrenament i inferència de xarxes neuronals. Aquest desenvolupament es realitza sobre el dispositiu Jetson AGX Xavier de l’empresa NVIDIA, el qual està destinat a aplicacions d’intel·ligència artificial, com ara l’aprenentatge profund (deep learning). L’objectiu és dotar a l’aplicació HELENNA d’un suport que li permeta utilitzar els nombres en coma flotant de 16 bits sobre la GPU del dispositiu de NVIDIA, a través del llenguatge de programació CUDA. D’aquesta manera es podrà aconseguir un millor aprofitament dels recursos i del consum energètic, ja que amb l’aritmètica de precisió reduïda es pot incrementar l’eficiència dels entrenaments amb xarxes neuronals. [ES] La inteligencia artificial (IA) se está convirtiendo en un elemento imprescindible en diferentes ámbitos de la informática. Al mismo tiempo que la IA se está desarrollando a escala de algoritmos, también las arquitecturas de procesamiento se están adaptando para dar un mejor apoyo. Por este motivo en el presente trabajo se desarrolla un soporte en aritmética en coma flotante de 16 bits sobre una plataforma de entrenamiento e inferencia de redes neuronales. Este desarrollo se realiza sobre el dispositivo Jetson AGX Xavier de la empresa NVIDIA, el cual está destinado a aplicaciones de inteligencia artificial, como el aprendizaje profundo (deep learning). El objetivo es dar a la aplicación HELENNA de un soporte que le permita utilizar nombres en coma flotante de 16 bits sobre la GPU del dispositivo de NVIDIA, a través del lenguaje de programación CUDA. 
De este modo se podrá conseguir un mejor aprovechamiento de los recursos y del consumo energético, puesto que con la aritmética de precisión reducida se puede incrementar la eficiencia de los entrenamientos con redes neuronales. [EN] Artificial Intelligence (AI) is becoming an important element in different areas of computing. At the same time that AI is developing at the algorithm scale, processing architectures are also adapting to give a better support to it. For this reason, in the present project a 16-bit floating point arithmetic support is developed on a neural network training and inference platform. This development is carried out on the Jetson AGX Xavier device from NVIDIA, which is destined for artificial intelligence applications, like deep learning. The objective is to give the HELENNA application support that allows it to use 16-bit floating point numbers on the GPU of the NVIDIA device, through the CUDA programming language. In this way, better use of resources and energy consumption can be achieved, since with reduced precision arithmetic, the efficiency of training with neural networks can be increased. Díaz-Cano Lozano, R. (2021). Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma Jetson Xavier. Universitat Politècnica de València. http://hdl.handle.net/10251/172214",
    	author = "Díaz-Cano Lozano, Roberto",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	keywords = "16-bit floating point arithmetic ; Aprenentatge profund ; Aritmètica en coma flotant de 16 bits ; ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Artificial Intelligence ; CUDA ; Deep learning ; GPU ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; HELENNA ; Intel·ligència artificial ; Neuronal networks ; Xarxes neuronals",
    	language = "cat",
    	publisher = "Universitat Politècnica de València",
    	title = "Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma {Jetson} {Xavier}",
    	url = "http://hdl.handle.net/10251/172214",
    	year = 2021
    }
    
  21. Pere Díaz Bou. Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento. 2021. BibTeX

    @misc{DíazBouPere2021DdSd,
    	abstract = "[ES] La investigación en inteligencia artificial está creciendo a un ritmo constante y gracias a ello, se están resolviendo problemas que hasta hace poco parecían imposibles. Las Redes Neuronales son unos de los algoritmos mas usados para resolver estos problemas, y existen mas de un tipo de Red Neuronal. Cada una con cierta utilidad y complejidad. En este trabajo realizamos un análisis de las Redes Recurrentes. En concreto, analizamos su estructura y su funcionamiento. Posteriormente, implementaremos las Redes Recurrentes en HELENNA, una plataforma de entrenamiento e inferencia de Redes Neuronales. Para acabar, realizamos una evaluación funcional de la Red Recurrente para validar la correcta implementación. La implementación de la Red Neuronal Recurrentes será capaz de resolver problemas básicos como la predicción de dígitos escritos a mano con un porcentaje de precisión alto. [EN] Research in artificial intelligence is growing at incredible rates which allowed us to solve problems that seemed impossible before. One of the best known algorithms to solve artificial intelligence problems is neural networks. There are different types of neural networks, with different complexities and applications. Moreover, we will implement Recurrent Neural Networks in HELENNA, a training and inference plataform for Neural Networks. At last, we will evaluate the functional aspect of the Recurrent Neural Network to validate the correctness of it. Our recurrent neural network implementation will be able to predict with high accuracy handwritten digits. [CA] La recerca en intel·ligència artificial està creixent constantment i gràcies a això, ens ha permet resoldre problemes que abans pareixien impossibles. Les Xarxes Neuronals son uns dels algorismes mes usats per a resoldre aquests problemes. Hi ha mes de un tipus de Xarxa Neuronal, cada una usada per a diferents aplicacions i, amb diferents complexitats. 
A més a més, implementarem les Xarxes Recurrents en HELENNA, una plataforma d’entrenament i inferència de Xarxes Neuronals. Per finalitzar, farem una avaluació funcional de la Xarxa Recurrent per validar la correcta implementació. La implementació de la Xarxa Neuronal recurrent será capaç de resoldre problemes senzills com la predicció de dígits escrits a mà amb un alt nivell de precisió. Díaz Bou, P. (2021). Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento. Universitat Politècnica de València. http://hdl.handle.net/10251/173384",
    	author = "Díaz Bou, Pere",
    	copyright = "http://creativecommons.org/licenses/by/4.0/ info:eu-repo/semantics/openAccess",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Artificial Intelligence ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Inteligencia artificial ; Plataforma de entrenamiento ; Recurrent Neural Networks ; Redes neuronales recurrentes ; Training Plataform",
    	language = "spa",
    	publisher = "Universitat Politècnica de València",
    	title = "Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento",
    	url = "http://hdl.handle.net/10251/173384",
    	year = 2021
    }
    
  22. Jose Duro, Salvador Petit, Maria E Gomez and Julio Sahuquillo. Segment Switching: A New Switching Strategy for Optical HPC Networks. IEEE access 9:43095-43106, 2021. BibTeX

    @article{DuroJose2021SSAN,
    	abstract = "Photonics are becoming realistic technologies for implementing interconnection networks in near future Exascale supercomputer systems. Photonics present key features to design high-performance and scalable supercomputer networks, such as higher bandwidth and lower latencies than their electronic supercomputer networks counterparts. Some research work is focused on conventional network topologies built with photonic technologies, with the aim of taking advantage of photonic characteristics. Nevertheless, these approaches fail in that they keep low the network utilization. We looked into this downside and we found that circuit switching was the main performance limitation. In this article we propose a new switching mechanism, called Segment Switching, to address this constraint and improve the network utilization. Segment Switching splits the circuit in segments of the whole path, and uses buffering on selected nodes on the network. Experimental results show that the devised approach significantly outperforms photonic circuit switching in conventional torus and fat tree networks by 70% and 90%, respectively.",
    	author = "Duro, Jose and Petit, Salvador and Gomez, Maria E and Sahuquillo, Julio",
    	issn = "2169-3536",
    	journal = "IEEE access",
    	keywords = "Bandwidth ; exascale supercomputers ; Integrated circuit interconnections ; Interconnection networks ; Optical buffering ; Optical switches ; photonic technology ; Photonics ; simulation ; Switching circuits ; Wavelength division multiplexing",
    	language = "eng",
    	pages = "43095--43106",
    	publisher = "IEEE",
    	title = "Segment Switching: A New Switching Strategy for Optical {HPC} Networks",
    	volume = 9,
    	year = 2021
    }
    
  23. Marta Navarro, Lucia Pons and Julio Sahuquillo. Hy-Sched: A Simple Hyperthreading-Aware Thread to Core Allocation Strategy. IEEE computer architecture letters 20(1):26-29, 2021. BibTeX

    @article{NavarroMarta2021HASH,
    	abstract = "Simultaneous multithreading processors are dominating the High Computing Performance market. Among these processors, those supporting only two threads are being the most widely deployed in current systems, thus, only two threads compete at run-time for intra-core resources. The performance of these processors can be boosted by selecting symbiotic applications to be executed on the same core, which reduces the inter-application interference considerably. In this letter we propose Hy-Sched, an scheduling algorithm that exploits symbiosis to make pairs of applications to be launched on the same physical core. The proposed approach lies on the categories of the Top-Down Method for Performance Analysis. Different variants of the algorithm are explored. Experimental results show that Hy-Sched outperforms Linux on average by 15 percent in the studied workloads.",
    	author = "Navarro, Marta and Pons, Lucia and Sahuquillo, Julio",
    	issn = "1556-6056",
    	journal = "IEEE computer architecture letters",
    	keywords = "Benchmark testing ; Hardware ; Instruction sets ; Interference ; intra-core interference ; Linux ; Mathematical model ; Simultaneous multithreading ; Symbiosis ; symbiotic applications",
    	language = "eng",
    	number = 1,
    	pages = "26--29",
    	publisher = "IEEE",
    	title = "{Hy-Sched}: A Simple Hyperthreading-Aware Thread to Core Allocation Strategy",
    	volume = 20,
    	year = 2021
    }
    
  24. Josue Feliu, Ajeya Naithani, Julio Sahuquillo, Salvador Petit, Moinuddin K Qureshi and Lieven Eeckhout. VMT: Virtualized Multi-Threading for Accelerating Graph Workloads on Commodity Processors. IEEE transactions on computers, pages 1-1, 2021. BibTeX

    @article{FeliuJosue2021VVMf,
    	abstract = "Modern-day graph workloads operate on huge graphs through pointer chasing which leads to high last-level cache (LLC) miss rates and limited memory-level parallelism (MLP). Simultaneous Multi-Threading (SMT) effectively hides the memory access latencies for multi-threaded graph workloads provided that sufficient threads are supported in hardware. Unfortunately, providing a sufficiently large number of physical threads incurs an unjustifiably high hardware cost for commodity SMT processors which typically implement only two physical hardware threads. Ideally, we would like to achieve aggressive-SMT performance when running graph workloads on modest commodity processors. In this paper, we propose Virtualized Multi-Threading (VMT), a low-overhead multi-threading paradigm for accelerating graph workloads on commodity processors. Unlike prior multi-threading paradigms, VMT virtualizes both the physical hardware threads and the architecture state: VMT maps a large number of logical software threads to a small number of physical hardware threads, while maintaining the architecture state of the logical threads in the processor's cache hierarchy. Implemented on top of a quad-core 2-way SMT processor, VMT achieves an average speedup of 1.74x for a set of representative graph workloads, while incurring minimal hardware cost (195 bytes per core to support up to 32 logical threads). VMT's low hardware cost paves the way for implementation in commodity processors.",
    	author = "Feliu, Josue and Naithani, Ajeya and Sahuquillo, Julio and Petit, Salvador and Qureshi, Moinuddin K and Eeckhout, Lieven",
    	issn = "0018-9340",
    	journal = "IEEE transactions on computers",
    	keywords = "Architecture State ; Computer architecture ; Graph Workloads ; Hardware ; Instruction sets ; Message systems ; Multi-Threading ; Registers ; Software ; Switches ; Virtualization",
    	language = "eng",
    	pages = "1--1",
    	publisher = "IEEE",
    	title = "{VMT}: Virtualized Multi-Threading for Accelerating Graph Workloads on Commodity Processors",
    	year = 2021
    }
    
  25. José Duro Gómez. Photonic Interconnection Networks for Exascale Computers. 2021. BibTeX

    @misc{DuroGómezJosé2021PINf,
    	abstract = "[ES] En los últimos años, distintos proyectos alrededor del mundo se han centrado en el diseño de supercomputadores capaces de alcanzar la meta de la computación a exascala, con el objetivo de soportar la ejecución de aplicaciones de gran importancia para la sociedad en diversos campos como el de la salud, la inteligencia artificial, etc. Teniendo en cuenta la creciente tendencia de la potencia computacional en cada generación de supercomputadores, este objetivo se prevee accesible en los próximos años. Alcanzar esta meta requiere abordar diversos retos en el diseño y desarrollo del sistema. Uno de los principales es conseguir unas comunicaciones rápidas y eficientes entre el inmenso número de nodos de computo y los sitemas de memoria. La tecnología fotónica proporciona ciertas ventajas frente a las redes eléctricas, como un mayor ancho de banda en los enlaces, un mayor paralelismo a nivel de comunicaciones gracias al DWDM o una mejor gestión del cableado gracias a su reducido tamaño. En la tesis se ha desarrollado un estudio de viabilidad y desarrollo de redes de interconexión haciendo uso de la tecnología fotónica para los futuros sistemas a exaescala dentro del proyecto europeo ExaNeSt. En primer lugar, se ha realizado un análisis y caracterización de aplicaciones exaescala. Este análisis se ha utilizado para conocer el comportamiento y requisitos de red que presentan las aplicaciones, y con ello guiarnos en el diseño de la red del sistema. El análisis considera tres parámetros: la distribución de mensajes en base a su tamaño y su tipo, el consumo de ancho de banda requerido a lo largo de la ejecución y la matriz de comunicación espacial entre los nodos. El estudio revela la necesidad de una red eficiente y rápida, debido a que la mayoría de las comunaciones se realizan en burst y con mensajes de un tamaño medio inferior a 50KB. 
A continuación, la tesis se centra en identificar los principales elementos que diferencian las redes fotónicas de las eléctricas. Identificamos una secuencia de pasos en el diseño de un simulador, ya sea haciéndolo desde cero con tecnología fotónica o adaptando un simulador de redes eléctricas existente para modelar la fotónica. Después se han realizado dos estudios de rendimiento y comparativas entre las actuales redes eléctricas y distintas configuraciones de redes fotónicas utilizando topologías clásicas. En el primer estudio, realizado tanto con tráfico sintético como con trazas de ExaNeSt en un toro, fat tree y dragonfly, se observa como la tecnología fotónica supone una clara mejora respecto a la eléctrica. Además, el estudio muestra que el parámetro que más afecta al rendimiento es el ancho de banda del canal fotónico. El segundo estudio muestra el comportamiento y rendimiento de aplicaciones reales en simulaciones a gran escala en una topología jellyfish. En este estudio se confirman las conclusiones obtenidas en el anterior, revelando además que la tecnología fotónica permite reducir la complejidad de algunas topologías, y por ende, el coste de la red. En los estudios realizados se ha observado una baja utilización de la red debido a que las topologías utilizadas para redes eléctricas no aprovechan las características que proporciona la tecnología fotónica. Por ello, se ha propuesto Segment Switching, una estrategia de conmutación orientada a reducir la longitud de las rutas mediante el uso de buffers intermedios. Los resultados experimentales muestran que cada topología tiene sus propios requerimientos. En el caso del toro, el mayor rendimiento se obtiene con un mayor número de buffers en la red. En el fat tree el parámetro más importante es el tamaño del buffer, obteniendo unas prestaciones similares una configuración con buffers en todos los switches que la que los ubica solo en el nivel superior. 
En resumen, esta tesis estudia el uso de la tecnología fotónica para las redes de sistemas a exascala y propone aprovechar [CA] Els darrers anys, múltiples projectes de recerca a tot el món s'han centrat en el disseny de superordinadors capaços d'assolir la barrera de computació exascala, amb l'objectiu de donar suport a l'execució d'aplicacions importants per a la nostra societat, com ara salut, intel·ligència artificial, meteorologia, etc. Segons la tendència creixent en la potència de càlcul en cada generació de superordinadors, es preveu assolir aquest objectiu en els propers anys. No obstant això, assolir aquest objectiu requereix abordar diferents reptes importants en el disseny i desenvolupament del sistema. Un dels principals és aconseguir comunicacions ràpides i eficients entre l'enorme nombre de nodes computacionals i els sistemes de memòria. La tecnologia fotònica proporciona diversos avantatges respecte a les xarxes elèctriques actuals, com ara un major ample de banda als enllaços, un major paral·lelisme de la xarxa gràcies a DWDM o una millor gestió del cable a causa de la seva mida molt més xicoteta. En la tesi, s'ha desenvolupat un estudi de viabilitat i desenvolupament de xarxes d'interconnexió mitjançant tecnologia fotònica per a futurs sistemes exascala dins del projecte europeu ExaNeSt. En primer lloc, s'ha dut a terme un estudi de caracterització d'aplicacions exascala dels requisits de xarxa. Els resultats de l'anàlisi ajuden a entendre els requisits de xarxa de les aplicacions exascale i, per tant, ens guien en el disseny de la xarxa del sistema. Aquesta anàlisi considera tres paràmetres principals: la distribució dels missatges en funció de la seva mida i tipus, el consum d'ample de banda requerit durant tota l'execució i els patrons de comunicació espacial entre els nodes. 
L'estudi revela la necessitat d'una xarxa d'interconnexió ràpida i eficient, ja que la majoria de comunicacions consisteixen en ràfegues de transmissions, cadascuna amb una mida mitjana de missatge de 50 KB. A continuació, la tesi se centra a identificar els principals elements que diferencien les xarxes fotòniques de les elèctriques. Identifiquem una seqüència de passos en el disseny i implementació d'un simulador: tractar la tecnologia fotònica des de zero o per ampliar un simulador de xarxa elèctrica existent per modelar la fotònica. Després, es presenten dos estudis principals de comparació de rendiment entre xarxes elèctriques i diferents configuracions de xarxes fotòniques mitjançant topologies clàssiques. En el primer estudi, realitzat tant amb trànsit sintètic com amb traces d'ExaNeSt en un toro, fat tree i dragonfly, vam trobar que la tecnologia fotònica representa una millora notable respecte a la tecnologia elèctrica. A més, l'estudi mostra que el paràmetre que més afecta el rendiment és l'amplada de banda del canal fotònic. Aquest darrer estudi analitza el rendiment d'aplicacions reals en simulacions a gran escala en una topologia jellyfish. Els resultats d'aquest estudi corroboren les conclusions obtingudes en l'anterior, revelant també que la tecnologia fotònica permet reduir la complexitat d'algunes topologies i, per tant, el cost de la xarxa. En els estudis anteriors ens adonem que la xarxa estava infrautilitzada principalment perquè les topologies estudiades per a xarxes elèctriques no aprofiten les característiques proporcionades per la tecnologia fotònica. Per aquest motiu, proposem Segment Switching, una estratègia de commutació destinada a reduir la longitud de les rutes mitjançant la implementació de memòries intermèdies en nodes intermedis al llarg de la ruta. Els resultats experimentals mostren que cadascuna de les topologies estudiades presenta diferents requisits de memòria intermèdia. 
Per al toro, com més gran siga el nombre de memòries intermèdies a la xarxa, major serà el rendiment. Per al fat tree, el paràmetre clau és la mida de la memòria intermèdia, aconseguint un rendiment similar tant amb una configuració amb memòria intermèdia en tots els co [EN] In the last recent years, multiple research projects around the world have focused on the design of supercomputers able to reach the exascale computing barrier, with the aim of supporting the execution of important applications for our society, such as health, artificial intelligence, meteorology, etc. According to the growing trend in the computational power in each supercomputer generation, this objective is expected to be reached in the coming years. However, achieving this goal requires addressing distinct major challenges in the design and development of the system. One of the main ones is to achieve fast and efficient communications between the huge number of computational nodes and the memory systems. Photonics technology provides several advantages over current electrical networks, such as higher bandwidth in the links, greater network parallelism thanks to DWDM, or better cable management due to its much smaller size. In this thesis, a feasibility study and development of interconnection networks have been developed using photonics technology for future exascale systems within the European project ExaNeSt. First, a characterization study of exascale applications from the network requirements has been carried out. The results of the analysis help understand the network requirements of exascale applications, and thereby guide us in the design of the system network. This analysis considers three main parameters: the distribution of the messages based on their size and type, the required bandwidth consumption throughout the execution, and the spatial communication patterns between the nodes. 
The study reveals the need for a fast and efficient interconnection network, since most communications consist of bursts of transmissions, each with an average message size of 50 KB. Next, this dissertation concentrates on identifying the main elements that differentiate photonic networks from electrical ones. We identify a sequence of steps in the design and implementation of a simulator either i) dealing with photonic technology from scratch or ii) to extend an existing electrical network simulator in order to m",
    	author = "Duro Gómez, José",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Computacion a exaescala ; Exascale Supercomputers ; Interconnection networks ; Marcos de simulación ; Optical networks ; Photonic networks ; Photonic Technology ; Redes de interconexión ; Redes fotónicas ; Redes ópticas ; Simulation frameworks ; Supercomputadores a exaescala ; Tecnología fotónica",
    	language = "eng",
    	publisher = "Universitat Politècnica de València",
    	title = "Photonic Interconnection Networks for Exascale Computers",
    	year = 2021
    }
    
  26. Dezhen Wu. Estudio de prestaciones de cargas de latencia crítica en sistemas SMT. 2021. BibTeX

    @mastersthesis{WuDezhen2021Edpd,
    	abstract = "[ES] La computación en la nube (cloud computing) ofrece servicios de computación bajo demanda a través de una red (habitualmente internet). Es un servicio ampliamente utilizado en la actualidad y, por tanto, existe una gran cantidad de trabajo centrado en el análisis y la mejora de prestaciones de este tipo de servicios. El presente proyecto se centra en cargas de latencia crítica, que son aquellas que deben garantizar una latencia máxima dentro de un umbral para evitar que los usuarios sufran una degradación de prestaciones, cuantificada en términos de calidad de servicio. Aplicaciones como los servicios de búsqueda, el reconocimiento de texto o imágenes y la consulta de bases de datos son aplicaciones de latencia crítica típicas. El presente proyecto propone analizar las prestaciones de este tipo de cargas cuando se ejecutan en un procesador con soporte para la ejecución simultanea de hilos (SMT) utilizando la herramienta perf para determinar las estructuras del procesador que principalmente limitan sus prestaciones. [EN] Cloud computing offers computing services on demand over a network (usually the internet). It is a widely used service today and, therefore, there is a large amount of work focused on the analysis and improvement of the benefits of this type of service. This project focuses on critical latency loads, which are those that must guarantee maximum latency within a threshold to prevent users from suffering a performance degradation, quantified in terms of quality of service. Applications such as search services, text or image recognition, and database queries are typical latency-critical applications. This project proposes to analyze the performance of this type of workloads when running on a simultaneous multithreading (SMT) processor using the perf tool to determine the processor structures that mainly limit their performance. Wu, D. (2021). Estudio de prestaciones de cargas de latencia crítica en sistemas SMT. 
Universitat Politècnica de València. http://hdl.handle.net/10251/164602",
    	author = "Wu, Dezhen",
    	copyright = "http://rightsstatements.org/vocab/InC/1.0/ info:eu-repo/semantics/openAccess",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Cloud computing ; Computación en la nube ; Latencia crítica ; Latency-critical ; Máster Universitario en Ingeniería de Computadores y Redes-Màster Universitari en Enginyeria de Computadors i Xarxes ; Perf ; SMT ; TailBench",
    	language = "spa",
    	publisher = "Universitat Politècnica de València",
    	school = "Universitat Politècnica de València",
    	title = "Estudio de prestaciones de cargas de latencia crítica en sistemas {SMT}",
    	url = "http://hdl.handle.net/10251/164602",
    	year = 2021
    }
    
  27. Miguel Antonio Avargues Gutiérrez. Análisis de requerimientos y diseño de un controlador de memoria principal no volátil. 2021. BibTeX

    @mastersthesis{AvarguesGutiérrezMiguelAntonio2021Adry,
    	abstract = "[ES] En la actualidad debido a la descentralización de la computación, la mayoría de los cálculos se realizan en servidores que ejecutan cargas de trabajo pesadas. Usualmente, estos servidores ejecutan aplicaciones que hacen elevado uso de memoria principal. Esto conlleva que las memorias DRAM convencionales no almacenan todos los datos necesarios para la ejecución de estas aplicaciones, siendo así el principal cuello de botella en estos sistemas. Las memorias NVRAM intentan solucionar este problema ofreciendo mayor densidad de almacenamiento de información a coste de latencias de acceso mayores. En este proyecto se implementa un controlador de memoria principal para memorias NVRAM con el objetivo de mejorar las prestaciones de aplicaciones que acceden a este tipo de memorias. [EN] Currently due to the computation decentralization, most of the computing is made on servers who perform heavy workloads. Usually, these servers run applications which make high use of main memory. This fact makes conventional DRAM memories unable to store all data needed in order to run these applications, making them the main bottleneck on those systems. NVRAM memories try to solve this bottleneck by offering more memory storage density at the cost of higher access latency. In this project a memory controller for NVRAM memories is implemented, with the goal of increasing the performance of applications that access these types of memories. [CA] En l’actualitat a causa de la descentralització de la computació, la majoria dels càlculs es realitzen en servidors que executen càrregues de treball pesades. Usualment, aquests servidors executen aplicacions que fan un ús elevat de memòria principal. Això implica que les memòries DRAM convencionals no emmagatzemen totes les dades necessàries amb tal d’executar aquestes aplicacions, sent açò el principal coll de botella en aquests sistemes. 
Les memòries NVRAM intenten solucionar aquest problema oferint una major densitat de emmagatzenament d’informació a canvi de major latències d’accés. En aquest projecte s’implementa un controlador de memòria principal per a memòries NVRAM amb l’objectiu de millorar les prestacions d’aplicacions que accedeixen a aquests tipus de memories. Avargues Gutiérrez, MA. (2021). Análisis de requerimientos y diseño de un controlador de memoria principal no volátil. Universitat Politècnica de València. http://hdl.handle.net/10251/174123",
    	author = "Avargues Gutiérrez, Miguel Antonio",
    	copyright = "http://creativecommons.org/licenses/by-sa/4.0/ info:eu-repo/semantics/openAccess",
    	keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Controlador de memoria ; Gem5 ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Main memory ; Memoria no volátil ; Memoria principal ; Memory controller ; Non-volatile memory ; NVMain ; NVRAM",
    	language = "spa",
    	publisher = "Universitat Politècnica de València",
    	school = "Universitat Politècnica de València",
    	title = "Análisis de requerimientos y diseño de un controlador de memoria principal no volátil",
    	type = "Bachelor's Thesis",
    	url = "http://hdl.handle.net/10251/174123",
    	year = 2021
    }
    
  28. Carlos Navarro, Josué Feliu, Salvador Petit, Maria E Gomez and Julio Sahuquillo. Bandwidth-Aware Dynamic Prefetch Configuration for IBM POWER8. IEEE Transactions on Parallel and Distributed Systems PP (99), 2020. BibTeX

    @article{ 10.1109/tpds.2020.2982392,
    	author = "Navarro, Carlos and Feliu, Josu{\'e} and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
    	abstract = "Advanced hardware prefetch engines are being integrated in current high-performance processors. Prefetching can boost the performance of most applications, however, the induced bandwidth consumption can lead the system to a high contention for main memory bandwidth, which is a scarce resource in current multicores. In such a case, the system performance can be severely damaged. This work characterizes the applications’ behavior in an IBM POWER8 machine, which presents many prefetch settings, varying the bandwidth contention degree. The study reveals that the best prefetch setting for each application depends on the main memory bandwidth availability, that is, it depends on the co-running applications. Based on this study, we propose Bandwidth-Aware Prefetch Configuration (BAPC) a scalable adaptive prefetching algorithm that improves the performance of multi-program workloads. BAPC increases the performance of the applications in a 8%, 11%, and 12% for workload mixes composed of 6, 8, and 10 applications over the IBM POWER8 default configuration. In addition to performance, BAPC reduces bandwidth consumption in 39%, 42%, and 45%, respectively.",
    	doi = "10.1109/tpds.2020.2982392",
    	journal = "IEEE Transactions on Parallel and Distributed Systems",
    	number = 99,
    	title = "Bandwidth-Aware Dynamic Prefetch Configuration for {IBM} {POWER8}",
    	volume = "PP",
    	year = 2020
    }
    
  29. Tomas Picornell, Carles Hernández, Jose Flich and Jose Duato. Enforcing Predictability of Many-cores with DCFNoC. IEEE Transactions on Computers, 2020. BibTeX

    @article{ 10.1109/tc.2020.2987797,
    	author = "Picornell, Tomas and Hern{\'a}ndez, Carles and Flich, Jose and Duato, Jose",
    	abstract = "The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this paper we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a 4 × 4 setting by a factor of 3.7× when interference traffic is injected. For a 8 × 8 network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79% over a standard wormhole implementation.",
    	doi = "10.1109/tc.2020.2987797",
    	journal = "IEEE Transactions on Computers",
    	title = "Enforcing Predictability of Many-cores with {DCFNoC}",
    	year = 2020
    }
    
  30. Miguel Gorgues and Jose Flich. A Low-Latency and Flexible TDM NoC for Strong Isolation in Security-Critical Systems. 2019 IEEE 13th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC), 2019. BibTeX

    @inproceedings{ 10.1109/mcsoc.2019.00029,
    	author = "Gorgues, Miguel and Flich, Jose",
    	booktitle = "2019 IEEE 13th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC)",
    	doi = "10.1109/mcsoc.2019.00029",
    	title = "A Low-Latency and Flexible {TDM} {NoC} for Strong Isolation in Security-Critical Systems",
    	year = 2019
    }
    
  31. Francisco Candel, Alejandro Valero, Salvador Petit and Julio Sahuquillo. An Aging-Aware GPU Register File Design Based on Data Redundancy. IEEE Transactions on Computers 68(1):4-20, 2019. BibTeX

    @article{ 10.1109/tc.2018.2849376,
    	author = "Candel, Francisco and Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
    	abstract = "Nowadays, GPUs sit at the forefront of high-performance computing thanks to their massive computational capabilities. Internally, thousands of functional units, architected to be fed by large register files, fuel such a performance. At deep nanometer technologies, the SRAM memory cells that implement GPU register files are very sensitive to the Negative Bias Temperature Instability (NBTI) effect. NBTI ages cell transistors by degrading their threshold voltage $V_{th}$ over the lifetime of the GPU. This degradation, which manifests when a cell keeps the same logic value for a relatively long period of time, compromises the cell read stability and increases the transistor switching delay, which can lead to wrong read values and eventually exceed the processor cycle time, respectively, so resulting in faulty operation. This work proposes architectural mechanisms leveraging the redundancy of the data stored in GPU register files to attack NBTI aging. The proposed mechanisms are based on data compression, power gating, and register address rotation techniques. All these mechanisms working together balance the distribution of logic values stored in the cells along the execution time, reducing both the overall $V_{th}$ degradation and the increase in the transistor switching delays. Experimental results show that a conventional GPU register file suffers the worst case for NBTI, since a significant fraction of the cells maintain the same logic value during the entire application execution (i.e., a 100 percent ‘0’ and ‘1’ duty cycle distributions). On average, the proposal reduces these distributions by 58 and 68 percent, respectively, which translates into $V_{th}$ degradation savings by 54 and 62 percent, respectively.",
    	doi = "10.1109/tc.2018.2849376",
    	journal = "IEEE Transactions on Computers",
    	number = 1,
    	pages = "4--20",
    	title = "An Aging-Aware {GPU} Register File Design Based on Data Redundancy",
    	volume = 68,
    	year = 2019
    }
    
  32. Jose Puche, Salvador Petit, Maria E Gomez and Julio Sahuquillo. An efficient cache flat storage organization for multithreaded workloads for low power processors. Future Generation Computer Systems, 2019. BibTeX

    @article{ 10.1016/j.future.2019.11.024,
    	author = "Puche, Jose and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
    	doi = "10.1016/j.future.2019.11.024",
    	journal = "Future Generation Computer Systems",
    	title = "An efficient cache flat storage organization for multithreaded workloads for low power processors",
    	year = 2019
    }
    
  33. Tomas Picornell, Carles Hernández, Jose Duato and Jose Flich. DCFNoC: A Delayed Conflict-Free Time Division Multiplexing Network on Chip. 56th Annual Design Automation Conference 2019, 2019. BibTeX

    @inproceedings{ 10.1145/3316781.3317794,
    	author = "Picornell, Tomas and Hern{\'a}ndez, Carles and Duato, Jose and Flich, Jose",
    	abstract = "The adoption of many-cores in safety-critical systems requires real-time capable networks on chip (NoC). In this paper we propose a new time-predictable NoC design paradigm where contention within the network is eliminated. This new paradigm builds on the Channel Dependency Graph (CDG) and guarantees by design the absence of contention. Our delayed conflict-free NoC (DCFNoC) is able to naturally inject messages using a TDM period equal to the optimal theoretical bound and without the need of using a computationally demanding offline process. Results show that DCFNoC guarantees time predictability with very low implementation cost.",
    	booktitle = "56th Annual Design Automation Conference 2019",
    	doi = "10.1145/3316781.3317794",
    	title = "{DCFNoC}: A Delayed Conflict-Free Time Division Multiplexing Network on Chip",
    	year = 2019
    }
    
  34. Francisco Candel, Alejandro Valero, Salvador Petit and Julio Sahuquillo. Efficient Management of Cache Accesses to Boost GPGPU Memory Subsystem Performance. IEEE Transactions on Computers 68(10):1442-1454, 2019. BibTeX

    @article{ 10.1109/tc.2019.2907591,
    	author = "Candel, Francisco and Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
    	abstract = "To support the massive amount of memory accesses that GPGPU applications generate, GPU memory hierarchies are becoming more and more complex, and the Last Level Cache (LLC) size considerably increases each GPU generation. This paper shows that counter-intuitively, enlarging the LLC brings marginal performance gains in most applications. In other words, increasing the LLC size does not scale neither in performance nor energy consumption. We examine how LLC misses are managed in typical GPUs, and we find that in most cases the way LLC misses are managed are precisely the main performance limiter. This paper proposes a novel approach that addresses this shortcoming by leveraging a tiny additional Fetch and Replacement Cache-like structure (FRC) that stores control and coherence information of the incoming blocks until they are fetched from main memory. Then, the fetched blocks are swapped with the victim blocks (i.e., selected to be replaced) in the LLC, and the eviction of such victim blocks is performed from the FRC. This approach improves performance due to three main reasons: i) the lifetime of blocks being replaced is enlarged, ii) the main memory path is unclogged on long bursts of LLC misses, and iii) the average LLC miss latency is reduced. The proposal improves the LLC hit ratio, memory-level parallelism, and reduces the miss latency compared to much larger conventional caches. Moreover, this is achieved with reduced energy consumption and with much less area requirements. Experimental results show that the proposed FRC cache scales in performance with the number of GPU compute units and the LLC size, since, depending on the FRC size, performance improves ranging from 30% to 67% for a modern baseline GPU card, and from 32% to 118% for a larger GPU. In addition, energy consumption is reduced on average from 49% to 57% for the larger GPU. These benefits come with a small area increase (by 7.3%) over the LLC baseline.",
    	doi = "10.1109/tc.2019.2907591",
    	journal = "IEEE Transactions on Computers",
    	number = 10,
    	pages = "1442--1454",
    	title = "Efficient Management of Cache Accesses to Boost {GPGPU} Memory Subsystem Performance",
    	volume = 68,
    	year = 2019
    }
    
  35. Jose Puche, Salvador Petit, Maria E Gomez and Julio Sahuquillo. FOS: a low-power cache organization for multicores. The Journal of Supercomputing 3s(75):1-32, 2019. BibTeX

    @article{ 10.1007/s11227-019-02858-x,
    	author = "Puche, Jose and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
    	abstract = "The cache hierarchy of current multicore processors typically consists of one or two levels of private caches per core and a large shared last-level cache. This approach incurs area and energy wasting due to oversizing the private cache space, data replication through the inclusive cache levels, as well as the use of highly set-associative caches. In this paper, we claim that although this is the commonly adopted approach, it presents important design issues that can be addressed by a more energy efficient organization. This work proposes Flat On-chip Storage (FOS), a novel cache organization that, aimed at addressing energy and area on low-power processors, resolves the mentioned issues. For this purpose, FOS combines L2 and L3 cache levels into a single one, organized as a flat space, and composed of a pool of private small cache slices. These slices are initially powered off to save energy, and they are powered on and assigned to cores provided that the system performance is expected to improve. To provide fast and uniform access from the private L1 caches to the FOS’s cache slices, multiple architectural challenges are overcome, which entails the design of a custom optical network-on-chip. Experimental results show that FOS achieves significant energy savings on both static and dynamic energy over conventional cache organizations with the same storage capacity. FOS static energy savings are as much as 60% over an electrically connected shared cache; these savings grow up to 75% compared to optically connected baselines. Moreover, despite deactivating part of the cache space, FOS achieves similar performance values as those achieved by conventional approaches.",
    	doi = "10.1007/s11227-019-02858-x",
    	journal = "The Journal of Supercomputing",
    	number = "3s",
    	pages = "1--32",
    	title = "{FOS}: a low-power cache organization for multicores",
    	volume = 75,
    	year = 2019
    }
    
  36. Josué Feliu, Salvador Petit and Julio Sahuquillo. Thread Isolation to Improve Symbiotic Scheduling on SMT Multicore Processors. IEEE Transactions on Parallel and Distributed Systems PP (99), 2019. BibTeX

    @article{ 10.1109/tpds.2019.2934955,
    	author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio",
    	abstract = "Resource sharing is a critical issue in simultaneous multithreading (SMT) processors as threads running simultaneously on an SMT core compete for shared resources. Symbiotic job scheduling, which co-schedules applications with complementary resource demands, is an effective solution to maximize hardware utilization and improve overall system performance. However, symbiotic job scheduling typically distributes threads evenly among cores, i.e., all cores get assigned the same number of threads, which we find to lead to sub-optimal performance. In this paper, we show that asymmetric schedules (i.e., schedules that assign a different number of threads to each SMT core) can significantly improve performance compared to symmetric schedules. To leverage this finding, we propose thread isolation, a technique that turns symmetric schedules into asymmetric ones yielding higher overall system performance. Thread isolation identifies SMT-adverse applications and schedules them in isolation on a dedicated core to mitigate their sharp performance degradation under SMT. Our experimental results on an IBM POWER8 processor show that thread isolation improves system throughput by up to 5.5% compared to a state-of-the-art symmetric symbiotic job scheduler.",
    	doi = "10.1109/tpds.2019.2934955",
    	journal = "IEEE Transactions on Parallel and Distributed Systems",
    	number = 99,
    	title = "Thread Isolation to Improve Symbiotic Scheduling on {SMT} Multicore Processors",
    	volume = "PP",
    	year = 2019
    }
    
  37. Clara Furió, Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duro. A Workload Generator for Evaluating SMT Real-Time Systems. 2018 International Conference on High Performance Computing & Simulation (HPCS), 2018. BibTeX

    @inproceedings{ 10.1109/hpcs.2018.00067,
    	author = "Furi{\'o}, Clara and Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duro, Jose",
    	booktitle = "2018 International Conference on High Performance Computing {\&} Simulation (HPCS)",
    	doi = "10.1109/hpcs.2018.00067",
    	title = "A Workload Generator for Evaluating {SMT} Real-Time Systems",
    	year = 2018
    }
    
  38. Vicent Selfa, Julio Sahuquillo, Maria E Gomez and Crispín Gomez. Efficient selective multicore prefetching under limited memory bandwidth. Journal of Parallel and Distributed Computing (120), 2018. BibTeX

    @article{ 10.1016/j.jpdc.2018.05.002,
    	author = "Selfa, Vicent and Sahuquillo, Julio and Gomez, Maria E. and Gomez, Crisp{\'i}n",
    	abstract = "Current multicore systems implement multiple hardware prefetchers to tolerate long main memory latencies. However, memory bandwidth is a scarce shared resource which becomes critical with the increasing core count. To deal with this fact, recent works have focused on adaptive prefetchers, which control the prefetcher aggressiveness to regulate the main memory bandwidth consumption. Nevertheless, in limited bandwidth machines or under memory-hungry workloads, keeping active the prefetcher can damage the system performance and increase energy consumption. This paper introduces selective prefetching, where individual prefetchers are activated or deactivated to improve both main memory energy and performance, and proposes ADP, a prefetcher that deactivates local prefetchers in some cores when they present low performance and co-runners need additional bandwidth. Based on heuristics, an individual prefetcher is reactivated when performance enhancements are foreseen. Compared to a state-of-the-art adaptive prefetcher, ADP provides both performance and energy enhancements in limited memory bandwidth.",
    	doi = "10.1016/j.jpdc.2018.05.002",
    	journal = "Journal of Parallel and Distributed Computing",
    	title = "Efficient selective multicore prefetching under limited memory bandwidth",
    	volume = 120,
    	year = 2018
    }
    
  39. Jose Flich. Exploring Manycore Architectures for Next-Generation HPC Systems through the MANGO Approach. Microprocessors and Microsystems, 2018. BibTeX

    @article{ 10.1016/j.micpro.2018.05.011,
    	author = "Flich, Jose",
    	abstract = "The Horizon 2020 MANGO project aims at exploring deeply heterogeneous accelerators for use in High-Performance Computing systems running multiple applications with different Quality of Service (QoS) levels. The main goal of the project is to exploit customization to adapt computing resources to reach the desired QoS. For this purpose, it explores different but interrelated mechanisms across the architecture and system software. In particular, in this paper we focus on the runtime resource management, the thermal management, and support provided for parallel programming, as well as introducing three applications on which the project foreground will be validated.",
    	doi = "10.1016/j.micpro.2018.05.011",
    	journal = "Microprocessors and Microsystems",
    	title = "Exploring Manycore Architectures for Next-Generation {HPC} Systems through the {MANGO} Approach",
    	year = 2018
    }
    
  40. Francisco Candel, Salvador Petit, Alejandro Valero and Julio Sahuquillo. Improving GPU Cache Hierarchy Performance with a Fetch and Replacement Cache. The 24th International European Conference on Parallel and Distributed Computing, 2018. BibTeX

    @inproceedings{ gpu,
    	author = "Candel, Francisco and Petit, Salvador and Valero, Alejandro and Sahuquillo, Julio",
    	abstract = "In the last few years, GPGPU computing has become one of the most popular computing paradigms in high-performance computers due to its excellent performance to power ratio. The memory requirements of GPGPU applications widely differ from the requirements of CPU counterparts. The amount of memory accesses is several orders of magnitude higher in GPU applications than in CPU applications, and they present disparate access patterns. Because of this fact, large and highly associative Last-Level Caches (LLCs) bring much lower performance gains in GPUs than in CPUs. This paper presents a novel approach to manage LLC misses that efficiently improves LLC hit ratio, memory-level parallelism, and miss latencies in GPU systems. The proposed approach leverages a small additional Fetch and Replacement Cache (FRC) that stores control and coherence information of incoming blocks until they are fetched from main memory. Then, fetched blocks are swapped with victim blocks to be replaced in the LLC. After that, the eviction of victim blocks is performed from the FRC. This management approach improves performance due to three main reasons: i) the lifetime of blocks being replaced is increased, ii) the main memory path is unclogged on long bursts of LLC misses, and iii) the average L2 miss delaying latency is reduced. Experimental results show that our proposal increases the performance (OPC) over 25% in most of the studied applications, reaching improvements up to 400% in some applications.",
    	booktitle = "The 24th International European Conference on Parallel and Distributed Computing",
    	title = "Improving {GPU} Cache Hierarchy Performance with a Fetch and Replacement Cache",
    	year = 2018
    }
    
  41. Francisco Candel, Julio Sahuquillo, Salvador Petit and Alejandro Valero. Improving GPU Cache Hierarchy Performance with a Fetch and Replacement Cache: 24th International Conference on Parallel and Distributed Computing. 24th International Conference on Parallel and Distributed Computing, 2018. BibTeX

    @inproceedings{ 10.1007/978-3-319-96983-1_17,
    	author = "Candel, Francisco and Sahuquillo, Julio and Petit, Salvador and Valero, Alejandro",
    	booktitle = "24th International Conference on Parallel and Distributed Computing",
    	doi = "10.1007/978-3-319-96983-1_17",
    	title = "Improving {GPU} Cache Hierarchy Performance with a Fetch and Replacement Cache",
    	year = 2018
    }
    
  42. Lucía Pons, Vicent Selfa, Salvador Petit and Julio Sahuquillo. Improving System Turnaround Time with Intel CAT by Identifying LLC Critical Applications. Euro-Par 2018: Parallel Processing, pages 603-615, 2018. BibTeX

    @inproceedings{ 10.1007/978-3-319-96983-1_43,
    	author = "Pons, Luc{\'i}a and Selfa, Vicent and Petit, Salvador and Sahuquillo, Julio",
    	title = "{I}mproving {S}ystem {T}urnaround {T}ime with {I}ntel {CAT} by {I}dentifying {LLC} {C}ritical {A}pplications",
    	booktitle = "{Euro-Par} 2018: Parallel Processing",
    	pages = "603--615",
    	year = 2018,
    	doi = "10.1007/978-3-319-96983-1_43"
    }
    
  43. Jose Duro, Salvador Petit and Julio Sahuquillo. Modeling and analysis of the performance of exascale photonic networks. Concurrency and Computation Practice and Experience (31), 2018. BibTeX

    @article{ 10.1002/cpe.4773,
    	author = "Duro, Jose and Petit, Salvador and Sahuquillo, Julio",
    	title = "{M}odeling and analysis of the performance of exascale photonic networks",
    	journal = "Concurrency and Computation: Practice and Experience",
    	volume = 31,
    	year = 2018,
    	doi = "10.1002/cpe.4773",
    	abstract = "Photonics technology has become a promising and viable alternative for both on-chip and off-chip interconnection networks of future Exascale systems. Nevertheless, this technology is not mature enough yet in this context, so research efforts focusing on photonic networks are still required to achieve realistic suitable network implementations. In this regard, system-level photonic network simulators can help guide designers to assess the multiple design choices. Most current research is done on electrical network simulators, whose components work widely different from photonics components. In this work, we summarize and compare the working behavior of both technologies which includes the use of optical routers, wavelength-division multiplexing and circuit switching among others. After implementing them into a well-known simulation framework, an extensive simulation study has been carried out using realistic photonic network configurations with synthetic and realistic traffic. Experimental results show that, compared to electrical networks, optical networks can reduce the execution time of the studied real workloads in almost one order of magnitude. Our study also reveals that the photonic configuration highly impacts on the network performance, being the bandwidth per channel and the message length the most important parameters."
    }
    
  44. Jose Duro, Salvador Petit, Julio Sahuquillo and Maria E Gomez. Workload Characterization for Exascale Computing Networks. 2018 International Conference on High Performance Computing & Simulation (HPCS), 2018. BibTeX

    @inproceedings{ 10.1109/hpcs.2018.00069,
    	author = "Duro, Jose and Petit, Salvador and Sahuquillo, Julio and Gomez, Maria E.",
    	title = "{W}orkload {C}haracterization for {E}xascale {C}omputing {N}etworks",
    	booktitle = "2018 International Conference on High Performance Computing {\&} Simulation ({HPCS})",
    	year = 2018,
    	doi = "10.1109/hpcs.2018.00069"
    }