Publications 2014-2017

Carlos Reaño and Federico Silla. A Comparative Performance Analysis of Remote GPU Virtualization over Three Generations of GPUs. In 46th International Conference on Parallel Processing Workshops, ICPP Workshops 2017, Bristol, United Kingdom, August 14-17, 2017. 2017, 121–128. URL, DOI BibTeX

@inproceedings{ dblp:conf/icppw/reanos17,
	author = "Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "46th International Conference on Parallel Processing Workshops, ICPP Workshops 2017, Bristol, United Kingdom, August 14-17, 2017",
	crossref = "DBLP:conf/icppw/2017",
	doi = "10.1109/ICPPW.2017.29",
	pages = "121--128",
	title = "{A} {C}omparative {P}erformance {A}nalysis of {R}emote {GPU} {V}irtualization over {T}hree {G}enerations of {GPU}s",
	url = "https://doi.org/10.1109/ICPPW.2017.29",
	year = 2017
}

Roberto Peñaranda, Maria E Gomez and Pedro Lopez. A Fault-Tolerant Routing Strategy for KNS Topologies Based on Intermediate Nodes. Concurrency and Computation: Practice and Experience 29(SI HiPINEB 2016), 2017. BibTeX

@article{ 10.1002/cpe.4065,
	author = "Pe{\~n}aranda, Roberto and Gomez, Maria E. and Lopez, Pedro",
	abstract = "Exascale computing systems are being built with thousands of nodes. The high number of components of these systems significantly increases the probability of failure. A key component for them is the interconnection network. If failures occur in the interconnection network, they may isolate a large fraction of the machine. For this reason, an efficient fault-tolerant mechanism is needed to keep the system interconnected, even in the presence of faults. A recently proposed topology for these large systems is the hybrid k-ary n-direct s-indirect (KNS) family that provides optimal performance and connectivity at a reduced hardware cost. This paper presents a fault-tolerant routing methodology for the KNS topology that degrades performance gracefully in presence of faults and tolerates a large number of faults without disabling any healthy computing node. In order to tolerate network failures, the methodology uses a simple mechanism. For any source-destination pair, if necessary, packets are forwarded to the destination node through a set of intermediate nodes (without being ejected from the network) with the aim of circumventing faults. The evaluation results shows that the proposed methodology tolerates a large number of faults. For instance, it is able to tolerate more than 99.5% of fault combinations when there are ten faults in a 3-D network with 1,000 nodes using only one intermediate node and more than 99.98% if two intermediate nodes are used. Furthermore, the methodology offers a gracious performance degradation. As an example, performance degrades only by 1% for a 2-D network with 1,024 nodes and 1% faulty links.",
	journal = "Concurrency and Computation: Practice and Experience",
	number = "SI HiPINEB 2016",
	title = "{A} {F}ault-{T}olerant {R}outing {S}trategy for {KNS} {T}opologies {B}ased on {I}ntermediate {N}odes",
	volume = 29,
	year = 2017
}

Vicent Selfa, Julio Sahuquillo, Maria E Gomez and Salvador Petit. A Hardware Approach to Fairly Balance the Inter-Thread Interference in Shared Caches. IEEE Transactions on Parallel and Distributed Systems PP (99), 2017. BibTeX

@article{ 10.1109/tpds.2017.2713778,
	author = "Selfa, Vicent and Sahuquillo, Julio and Gomez, Maria E. and Petit, Salvador",
	abstract = "Shared caches have become the common design choice in the vast majority of modern multi-core and many-core processors, since cache sharing improves throughput for a given silicon area. Sharing the cache, however, has a downside: the requests from multiple applications compete among them for cache resources, so the execution time of each application increases over isolated execution. The degree in which the performance of each application is affected by the interference becomes unpredictable yielding the system to unfairness situations. This paper proposes Fair-Progress Cache Partitioning (FPCP), a low-overhead hardware-based cache partitioning approach that addresses system fairness. FPCP reduces the interference by allocating to each application a cache partition and adjusting the partition sizes at runtime. To adjust partitions, our approach estimates during multicore execution the time each application would have taken in isolation, which is challenging. The proposed approach has two main differences over existing approaches. First, FPCP distributes cache ways incrementally, which makes the proposal less prone to estimation errors. Second, the proposed algorithm is much less costly than the state-of-the-art ASM-Cache approach. Experimental results show that, compared to ASM-Cache, FPCP reduces unfairness by 48% in four-application workloads and by 28% in eight-application workloads, without harming the performance",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	number = 99,
	title = "{A} {H}ardware {A}pproach to {F}airly {B}alance the {I}nter-{T}hread {I}nterference in {S}hared {C}aches",
	volume = "PP",
	year = 2017
}

Vicent Selfa, Julio Sahuquillo, Salvador Petit and Maria E Gomez. Application Clustering Policies to Address System Fairness with Intel’s Cache Allocation Technology. 2017 26th International Conference on Parallel Architectures and Compilation Techniques (PACT), 2017. BibTeX

@inproceedings{ 10.1109/pact.2017.19,
	author = "Selfa, Vicent and Sahuquillo, Julio and Petit, Salvador and Gomez, Maria E.",
	booktitle = "2017 26th International Conference on Parallel Architectures and Compilation Techniques (PACT)",
	title = "{A}pplication {C}lustering {P}olicies to {A}ddress {S}ystem {F}airness with {I}ntel's {C}ache {A}llocation {T}echnology",
	year = 2017
}

Carlos Reaño, Federico Silla and Jose Duato. Enhancing the rCUDA Remote GPU Virtualization Framework: from a Prototype to a Production Solution. In Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGRID 2017, Madrid, Spain, May 14-17, 2017. 2017, 695–698. URL, DOI BibTeX

@inproceedings{ dblp:conf/ccgrid/reanosd17,
	author = "Rea{\~n}o, Carlos and Silla, Federico and Duato, Jose",
	booktitle = "Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGRID 2017, Madrid, Spain, May 14-17, 2017",
	crossref = "DBLP:conf/ccgrid/2017",
	doi = "10.1109/CCGRID.2017.42",
	pages = "695--698",
	title = "{E}nhancing the r{CUDA} {R}emote {GPU} {V}irtualization {F}ramework: from a {P}rototype to a {P}roduction {S}olution",
	url = "https://doi.org/10.1109/CCGRID.2017.42",
	year = 2017
}

Francisco Candel, Alejandro Valero, Salvador Petit and Julio Sahuquillo. Exploiting Data Compression to Mitigate Aging in GPU Register Files. 2017 29th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), 2017. BibTeX

@inproceedings{ 10.1109/sbac-pad.2017.15,
	author = "Candel, Francisco and Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
	abstract = "Nowadays, GPUs sit at the forefront of high-performance computing thanks to their massive computational capabilities. Internally, thousands of functional units, architected to be fed by large register files, fuel such a performance. At nanometer technologies, the SRAM cells that implement register files suffer the Negative Bias Temperature Instability (NBTI) effect, which degrades the transistor threshold voltage Vth and, in turn, can make cells faulty unreliable when they hold the same logic value for long periods of time. Fortunately, the GPU single-thread multiple-data execution model writes data in recognizable patterns. This work proposes mechanisms to detect those patterns, and to compress and shuffle the data, so that compressed register file entries can be safely powered off, mitigating NBTI aging. Experimental results show that a conventional GPU register file experiences the worst case for NBTI, since maintains cells with a single logic value during the entire application execution (i.e., a 100% ‘0’ and ‘1’ duty cycle distributions). On average, the proposal reduces these distributions by 61% and 72%, respectively, which translates into Vth degradation savings by 57% and 64%, respectively.",
	booktitle = "2017 29th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)",
	title = "{E}xploiting {D}ata {C}ompression to {M}itigate {A}ging in {GPU} {R}egister {F}iles",
	year = 2017
}

Jose Vicente Escamilla Lopez and Jose Flich. ICARO-PAPM: Congestion Management with Selective Queue Power-Gating. 2017 International Conference on High Performance Computing & Simulation (HPCS), 2017. BibTeX

@inproceedings{ 10.1109/hpcs.2017.47,
	author = "Escamilla Lopez, Jose Vicente and Flich, Jose",
	booktitle = "2017 International Conference on High Performance Computing {\&} Simulation (HPCS)",
	title = "{ICARO}-{PAPM}: {C}ongestion {M}anagement with {S}elective {Q}ueue {P}ower-{G}ating",
	year = 2017
}

Josué Feliu, Salvador Petit and Julio Sahuquillo. Improving IBM POWER8 Performance through Symbiotic Job Scheduling. IEEE Transactions on Parallel and Distributed Systems PP (99), 2017. BibTeX

@article{ 10.1109/tpds.2017.2691708,
	author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio",
	abstract = "Symbiotic job scheduling, i.e., scheduling applications that co-run well together on a core, can have a considerable impact on the performance of processors with simultaneous multithreading (SMT) cores. SMT cores share most of their microarchitectural components among the co-running applications, which causes performance interference between them. Therefore, scheduling applications with complementary resource requirements on the same core can greatly improve the throughput of the system. This paper enhances symbiotic job scheduling for the IBM POWER8 processor. We leverage the existing cycle accounting mechanism to build an interference model that predicts symbiosis between applications. The proposed models achieve higher accuracy than previous models by predicting job symbiosis from throttled CPI stacks, i.e., CPI stacks of the applications when running in the same SMT mode to consider the statically partitioned resources, but without interference from other applications. The symbiotic scheduler uses these interference models to decide, at run-time, which applications should run on the same core or on separate cores. We prototype the symbiotic scheduler as a user-level scheduler in the Linux operating system and evaluate it on an IBM POWER8 server running multiprogram workloads. The symbiotic job scheduler significantly improves performance compared to both an agnostic random scheduler and the default Linux scheduler. Across all evaluated workloads in SMT4 mode, throughput improves by 12.4% and 5.1% on average over the random and Linux schedulers, respectively.",
	journal = "IEEE Transactions on Parallel and Distributed Systems",
	number = 99,
	title = "{I}mproving {IBM} {POWER}8 {P}erformance through {S}ymbiotic {J}ob {S}cheduling",
	volume = "PP",
	year = 2017
}

Jose Flich. MANGO: Exploring Manycore Architectures for Next-GeneratiOn HPC Systems. 2017 Euromicro Conference on Digital System Design (DSD), 2017. BibTeX

@inproceedings{ 10.1109/dsd.2017.51,
	author = "Flich, Jose",
	abstract = "The Horizon 2020 MANGO project aims at exploring deeply heterogeneous accelerators for use in High-Performance Computing systems running multiple applications with different Quality of Service (QoS) levels. The main goal of the project is to exploit customization to adapt computing resources to reach the desired QoS. For this purpose, it explores different but interrelated mechanisms across the architecture and system software. In particular, in this paper we focus on the runtime resource management, the thermal management, and support provided for parallel programming, as well as introducing three applications on which the project foreground will be validated.",
	booktitle = "2017 Euromicro Conference on Digital System Design (DSD)",
	title = "{MANGO}: {E}xploring {M}anycore {A}rchitectures for {N}ext-{G}enerati{O}n {HPC} {S}ystems",
	year = 2017
}

Jose Duro, Salvador Petit, Julio Sahuquillo and Maria E Gomez. Modeling a Photonic Network for Exascale Computing. 2017 International Conference on High Performance Computing & Simulation (HPCS), 2017. BibTeX

@inproceedings{ 10.1109/hpcs.2017.82,
	author = "Duro, Jose and Petit, Salvador and Sahuquillo, Julio and Gomez, Maria E.",
	booktitle = "2017 International Conference on High Performance Computing {\&} Simulation (HPCS)",
	title = "{M}odeling a {P}hotonic {N}etwork for {E}xascale {C}omputing",
	year = 2017
}

Javier Prades, Blesson Varghese, Carlos Reaño and Federico Silla. Multi-tenant virtual GPUs for optimising performance of a financial risk application. J. Parallel Distrib. Comput. 108:28–44, 2017. URL, DOI BibTeX

@article{ dblp:journals/jpdc/pradesvrs17,
	author = "Prades, Javier and Varghese, Blesson and Rea{\~n}o, Carlos and Silla, Federico",
	doi = "10.1016/j.jpdc.2016.06.002",
	journal = "Journal of Parallel and Distributed Computing",
	pages = "28--44",
	title = "{M}ulti-tenant virtual {GPU}s for optimising performance of a financial risk application",
	url = "https://doi.org/10.1016/j.jpdc.2016.06.002",
	volume = 108,
	year = 2017
}

Federico Silla, Sergio Iserte, Carlos Reaño and Javier Prades. On the benefits of the remote GPU virtualization mechanism: The rCUDA case. Concurrency and Computation: Practice and Experience 29(13), 2017. URL, DOI BibTeX

@article{ dblp:journals/concurrency/sillairp17,
	author = "Silla, Federico and Iserte, Sergio and Rea{\~n}o, Carlos and Prades, Javier",
	doi = "10.1002/cpe.4072",
	journal = "Concurrency and Computation: Practice and Experience",
	number = 13,
	title = "{O}n the benefits of the remote {GPU} virtualization mechanism: {T}he r{CUDA} case",
	url = "https://doi.org/10.1002/cpe.4072",
	volume = 29,
	year = 2017
}

Carlos Reaño. On the Enhancement of Remote GPU Virtualization in High Performance Clusters. Universitat Politècnica de València, 2017. BibTeX

@phdthesis{reano2017thesis,
	author = {Rea{\~n}o, Carlos},
	title  = {{O}n the {E}nhancement of {R}emote {GPU} {V}irtualization in {H}igh {P}erformance {C}lusters},
	school = {Universitat Polit{\`e}cnica de Val{\`e}ncia},
	year   = {2017}
}

Joan Josep Valls, Alberto Ros, Maria E Gomez and Julio Sahuquillo. A Directory Cache with Dynamic Private-Shared Partitioning. 2016 IEEE 23rd International Conference on High Performance Computing (HiPC), 2016. BibTeX

@inproceedings{ 10.1109/hipc.2016.051,
	author = "Valls, Joan Josep and Ros, Alberto and Gomez, Maria E. and Sahuquillo, Julio",
	booktitle = "2016 IEEE 23rd International Conference on High Performance Computing (HiPC)",
	title = "{A} {D}irectory {C}ache with {D}ynamic {P}rivate-{S}hared {P}artitioning",
	year = 2016
}

Roberto Peñaranda, Pedro Lopez and Maria E Gomez. A New Fault-Tolerant Routing Methodology for KNS Topologies. 2016 2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era (HiPINEB), 2016. BibTeX

@inproceedings{ 10.1109/hipineb.2016.9,
	author = "Pe{\~n}aranda, Roberto and Lopez, Pedro and Gomez, Maria E.",
	booktitle = "2016 2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era (HiPINEB)",
	title = "{A} {N}ew {F}ault-{T}olerant {R}outing {M}ethodology for {KNS} {T}opologies",
	year = 2016
}

Julio Sahuquillo, Vicent Selfa, Crispín Gomez and Maria E Gomez. A Simple Activation/Deactivation Prefetching Scheme for Chip Multiprocessors. 2016 24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP), 2016. BibTeX

@inproceedings{ 10.1109/pdp.2016.47,
	author = "Sahuquillo, Julio and Selfa, Vicent and Gomez, Crisp{\'\i}n and Gomez, Maria E.",
	booktitle = "2016 24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP)",
	title = "{A} {S}imple {A}ctivation/{D}eactivation {P}refetching {S}cheme for {C}hip {M}ultiprocessors",
	year = 2016
}

Jose Puche, Salvador Petit and Maria E Gomez. Accurately modeling a photonic NoC in a detailed CMP simulation framework. 2016 International Conference on High Performance Computing & Simulation (HPCS), 2016. BibTeX

@inproceedings{ 10.1109/hpcsim.2016.7568361,
	author = "Puche, Jose and Petit, Salvador and Gomez, Maria E.",
	booktitle = "2016 International Conference on High Performance Computing {\&} Simulation (HPCS)",
	title = "{A}ccurately modeling a photonic {N}o{C} in a detailed {CMP} simulation framework",
	year = 2016
}

Javier Prades, Carlos Reaño and Federico Silla. CUDA acceleration for Xen virtual machines in infiniband clusters with rCUDA. In Proceedings of the 21st ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2016, Barcelona, Spain, March 12-16, 2016. 2016, 35:1–35:2. URL, DOI BibTeX

@inproceedings{ dblp:conf/ppopp/pradesrs16,
	author = "Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "Proceedings of the 21st ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2016, Barcelona, Spain, March 12-16, 2016",
	crossref = "DBLP:conf/ppopp/2016",
	doi = "10.1145/2851141.2851181",
	pages = "35:1--35:2",
	title = "{CUDA} acceleration for {X}en virtual machines in infiniband clusters with r{CUDA}",
	url = "http://doi.acm.org/10.1145/2851141.2851181",
	year = 2016
}

Alejandro Valero, Salvador Petit and Julio Sahuquillo. Enhancing the L1 Data Cache Design to Mitigate HCI. IEEE Computer Architecture Letters 15(2):93–96, 2016. BibTeX

@article{ 10.1109/lca.2015.2460736,
	author = "Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
	abstract = "Over the lifetime of a microprocessor, the Hot Carrier Injection (HCI) phenomenon degrades the threshold voltage, which causes slower transistor switching and eventually results in timing violations and faulty operation. This effect appears when the memory cell contents flip from logic ‘0’ to ‘1’ and vice versa. In caches, the majority of cell flips are concentrated into only a few of the total memory cells that make up each data word. In addition, other researchers have noted that zero is the most commonly-stored data value in a cache, and have taken advantage of this behavior to propose data compression and power reduction techniques. Contrary to these works, we use this information to extend the lifetime of the caches by introducing two microarchitectural techniques that spread and reduce the number of flips across the first-level (L1) data cache cells. Experimental results show that, compared to the conventional approach, the proposed mechanisms reduce the highest cell flip peak up to 65.8%, whereas the threshold voltage degradation savings range from 32.0% to 79.9% depending on the application.",
	journal = "IEEE Computer Architecture Letters",
	number = 2,
	pages = "93--96",
	title = "{E}nhancing the {L}1 {D}ata {C}ache {D}esign to {M}itigate {HCI}",
	volume = 15,
	year = 2016
}

Carlos Reaño and Federico Silla. Extending rCUDA with Support for P2P Memory Copies between Remote GPUs. In 18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016. 2016, 789–796. URL, DOI BibTeX

@inproceedings{ dblp:conf/hpcc/reanos16,
	author = "Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016",
	crossref = "DBLP:conf/hpcc/2016",
	doi = "10.1109/HPCC-SmartCity-DSS.2016.0114",
	pages = "789--796",
	title = "{E}xtending r{CUDA} with {S}upport for {P}2{P} {M}emory {C}opies between {R}emote {GPU}s",
	url = "https://doi.org/10.1109/HPCC-SmartCity-DSS.2016.0114",
	year = 2016
}

Javier Prades, Fernando Campos, Carlos Reaño and Federico Silla. GPGPU as a Service: Providing GPU-Acceleration Services to Federated Cloud Systems. In Developing Interoperable and Federated Cloud Architecture. 2016, pages 281–313. URL, DOI BibTeX

@incollection{ gpgpuaas_book_chapter10,
	author = "Prades, Javier and Campos, Fernando and Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "Developing Interoperable and Federated Cloud Architecture",
	doi = "10.4018/978-1-5225-0153-4.ch010",
	pages = "281--313",
	publisher = "IGI Global",
	title = "{GPGPU} as a {S}ervice: {P}roviding {GPU}-{A}cceleration {S}ervices to {F}ederated {C}loud {S}ystems",
	url = "https://doi.org/10.4018/978-1-5225-0153-4.ch010",
	year = 2016
}

Francisco Candel, Salvador Petit, Julio Sahuquillo and Jose Duato. Impact of Memory-Level Parallelism on the Performance of GPU Coherence Protocols. 2016 24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP), 2016. BibTeX

@inproceedings{ 10.1109/pdp.2016.67,
	author = "Candel, Francisco and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
	booktitle = "2016 24th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP)",
	title = "{I}mpact of {M}emory-{L}evel {P}arallelism on the {P}erformance of {GPU} {C}oherence {P}rotocols",
	year = 2016
}

Jose Vicente Escamilla Lopez and Jose Flich. Increasing the Efficiency of Latency-Driven DVFS with a Smart NoC Congestion Management Strategy. IEEE 10th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC), 2016. BibTeX

@inproceedings{ 10.1109/mcsoc.2016.42,
	author = "Escamilla Lopez, Jose Vicente and Flich, Jose",
	abstract = "Dynamic Voltage and Frequency Scaling (DVFS) can be a very effective power management strategy not only for on-chip processing elements but also for the network-on-chip (NoC). In this paper we propose a new approach to DVFS in NoC, which combines a congestion management strategy with a feedback-loop controller. The controller sets frequency and voltage to the lowest values that keep the NoC latency below a predetermined threshold. To cope with burstiness and hotspot patterns, which may lead the controller to overdrive the NoC with too high frequencies and voltages, leading to excessive power consumption, the congestion management strategy promptly identifies the flows that caused the abnormal traffic situation and eliminates them from the latency calculation, leading to a significantly higher power saving. Compared to a baseline DVFS strategy without congestion management, our results show that our proposal saves up to 53% more power when bursty or hotspot-based traffic patterns are detected. In addition, since we also apply power-gating to make an efficient use of the network buffers, we achieve an improvement of up to 38% in power savings when no bursts or hotspots are present.",
	booktitle = "IEEE 10th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC)",
	keywords = "NoC",
	title = "{I}ncreasing the {E}fficiency of {L}atency-{D}riven {DVFS} with a {S}mart {N}o{C} {C}ongestion {M}anagement {S}trategy",
	year = 2016
}

Sergio Iserte, Javier Prades, Carlos Reaño and Federico Silla. Increasing the Performance of Data Centers by Combining Remote GPU Virtualization with Slurm. In IEEE/ACM 16th International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016, Cartagena, Colombia, May 16-19, 2016. 2016, 98–101. URL, DOI BibTeX

@inproceedings{ dblp:conf/ccgrid/iserteprs16,
	author = "Iserte, Sergio and Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "IEEE/ACM 16th International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016, Cartagena, Colombia, May 16-19, 2016",
	crossref = "DBLP:conf/ccgrid/2016",
	doi = "10.1109/CCGrid.2016.26",
	pages = "98--101",
	title = "{I}ncreasing the {P}erformance of {D}ata {C}enters by {C}ombining {R}emote {GPU} {V}irtualization with {S}lurm",
	url = "https://doi.org/10.1109/CCGrid.2016.26",
	year = 2016
}

Josué Feliu, Salvador Petit, Julio Sahuquillo and Jose Duato. Perf&Fair: A Progress-Aware Scheduler to Enhance Performance and Fairness in SMT Multicores. IEEE Transactions on Computers PP (99), 2016. BibTeX

@article{ 10.1109/tc.2016.2620977,
	author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
	abstract = "Nowadays, high performance multicore processors implement multithreading capabilities. The processes running concurrently on these processors are continuously competing for the shared resources, not only among cores, but also within the core. While resource sharing increases the resource utilization, the interference among processes accessing the shared resources can strongly affect the performance of individual processes and its predictability. In this scenario, process scheduling plays a key role to deal with performance and fairness.",
	journal = "IEEE Transactions on Computers",
	number = 99,
	title = "{P}erf{\&}{F}air: {A} {P}rogress-{A}ware {S}cheduler to {E}nhance {P}erformance and {F}airness in {SMT} {M}ulticores",
	volume = "PP",
	year = 2016
}

Carlos Reaño and Federico Silla. Performance Evaluation of the NVIDIA Pascal GPU Architecture: Early Experiences. In 18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016. 2016, 1234–1235. URL, DOI BibTeX

@inproceedings{ dblp:conf/hpcc/reanos16a,
	author = "Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "18th IEEE International Conference on High Performance Computing and Communications; 14th IEEE International Conference on Smart City; 2nd IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2016, Sydney, Australia, December 12-14, 2016",
	crossref = "DBLP:conf/hpcc/2016",
	doi = "10.1109/HPCC-SmartCity-DSS.2016.0173",
	pages = "1234--1235",
	title = "{P}erformance {E}valuation of the {NVIDIA} {P}ascal {GPU} {A}rchitecture: {E}arly {E}xperiences",
	url = "https://doi.org/10.1109/HPCC-SmartCity-DSS.2016.0173",
	year = 2016
}

Ferran Perez, Carlos Reaño and Federico Silla. Providing CUDA Acceleration to KVM Virtual Machines in InfiniBand Clusters with rCUDA. In Distributed Applications and Interoperable Systems - 16th IFIP WG 6.1 International Conference, DAIS 2016, Held as Part of the 11th International Federated Conference on Distributed Computing Techniques, DisCoTec 2016, Heraklion, Crete, Greece, June 6-9, 2016, Proceedings. 2016, 82–95. URL, DOI BibTeX

@inproceedings{ dblp:conf/dais/perezrs16,
	author = "Perez, Ferran and Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "Distributed Applications and Interoperable Systems - 16th IFIP WG 6.1 International Conference, DAIS 2016, Held as Part of the 11th International Federated Conference on Distributed Computing Techniques, DisCoTec 2016, Heraklion, Crete, Greece, June 6-9, 2016, Proceedings",
	crossref = "DBLP:conf/dais/2016",
	doi = "10.1007/978-3-319-39577-7_7",
	pages = "82--95",
	title = "{P}roviding {CUDA} {A}cceleration to {KVM} {V}irtual {M}achines in {I}nfini{B}and {C}lusters with r{CUDA}",
	url = "https://doi.org/10.1007/978-3-319-39577-7_7",
	year = 2016
}

Carlos Reaño and Federico Silla. Reducing the performance gap of remote GPU virtualization with InfiniBand Connect-IB. In IEEE Symposium on Computers and Communication, ISCC 2016, Messina, Italy, June 27-30, 2016. 2016, 920–925. URL, DOI BibTeX

@inproceedings{ dblp:conf/iscc/reanos16,
	author = "Rea{\~n}o, Carlos and Silla, Federico",
	booktitle = "IEEE Symposium on Computers and Communication, ISCC 2016, Messina, Italy, June 27-30, 2016",
	crossref = "DBLP:conf/iscc/2016",
	doi = "10.1109/ISCC.2016.7543854",
	pages = "920--925",
	title = "{R}educing the performance gap of remote {GPU} virtualization with {I}nfini{B}and {C}onnect-{IB}",
	url = "https://doi.org/10.1109/ISCC.2016.7543854",
	year = 2016
}

Federico Silla, Javier Prades, Sergio Iserte and Carlos Reaño. Remote GPU Virtualization: Is It Useful? In 2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era HiPINEB@HPCA 2016, Barcelona, Spain, March 12, 2016. 2016, 41–48. URL, DOI BibTeX

@inproceedings{ dblp:conf/hpca/sillapir16,
	author = "Silla, Federico and Prades, Javier and Iserte, Sergio and Rea{\~n}o, Carlos",
	booktitle = "2nd IEEE International Workshop on High-Performance Interconnection Networks in the Exascale and Big-Data Era HiPINEB@HPCA 2016, Barcelona, Spain, March 12, 2016",
	crossref = "DBLP:conf/hpca/2016hipineb",
	doi = "10.1109/HIPINEB.2016.8",
	pages = "41--48",
	title = "{R}emote {GPU} {V}irtualization: {I}s {I}t {U}seful?",
	url = "https://doi.org/10.1109/HIPINEB.2016.8",
	year = 2016
}

Carlos Reaño, Federico Silla and Matthew J Leslie. SchedGPU: Fine-grain dynamic and adaptative scheduling for GPUs. In International Conference on High Performance Computing & Simulation, HPCS 2016, Innsbruck, Austria, July 18-22, 2016. 2016, 993–997. URL, DOI BibTeX

@inproceedings{ dblp:conf/ieeehpcs/reanosl16,
	author = "Rea{\~n}o, Carlos and Silla, Federico and Leslie, Matthew J.",
	booktitle = "International Conference on High Performance Computing {\&} Simulation, HPCS 2016, Innsbruck, Austria, July 18-22, 2016",
	crossref = "DBLP:conf/ieeehpcs/2016",
	doi = "10.1109/HPCSim.2016.7568444",
	pages = "993--997",
	title = "{S}ched{GPU}: {F}ine-grain dynamic and adaptative scheduling for {GPU}s",
	url = "https://doi.org/10.1109/HPCSim.2016.7568444",
	year = 2016
}

Julio Sahuquillo, Josué Feliu and Salvador Petit. Symbiotic job scheduling on the IBM POWER8. 2016 IEEE International Symposium on High Performance Computer Architecture (HPCA), 2016. BibTeX

@inproceedings{ 10.1109/hpca.2016.7446103,
	author = "Sahuquillo, Julio and Feliu, Josu{\'e} and Petit, Salvador",
	booktitle = "2016 IEEE International Symposium on High Performance Computer Architecture (HPCA)",
	title = "{S}ymbiotic job scheduling on the {IBM} {POWER}8",
	year = 2016
}

Joan Josep Valls, Alberto Ros, Maria E Gomez and Julio Sahuquillo. The Tag Filter Architecture: An energy-efficient cache and directory design. Journal of Parallel and Distributed Computing (100), 2016. BibTeX

@article{ 10.1016/j.jpdc.2016.04.016,
	author = "Valls, Joan Josep and Ros, Alberto and Gomez, Maria E. and Sahuquillo, Julio",
	abstract = "Power consumption in current high-performance chip multiprocessors (CMPs) has become a major design concern that aggravates with the current trend of increasing the core count. A significant fraction of the total power budget is consumed by on-chip caches which are usually deployed with a high associativity degree (even L1 caches are being implemented with eight ways) to enhance the system performance. On a cache access, each way in the corresponding set is accessed in parallel, which is costly in terms of energy. On the other hand, coherence protocols also must implement efficient directory caches that scale in terms of power consumption. Most of the state-of-the-art techniques that reduce the energy consumption of directories are at the cost of performance, which may become unacceptable for high-performance CMPs. In this paper, we propose an energy-efficient architectural design that can be effectively applied to any kind of cache memory. The proposed approach, called the Tag Filter (TF) Architecture, filters the ways accessed in the target cache set, and just a few ways are searched in the tag and data arrays. This allows the approach to reduce the dynamic energy consumption of caches without hurting their access time. For this purpose, the proposed architecture holds the X least significant bits of each tag in a small auxiliary X-bit-wide array. These bits are used to filter the ways where the least significant bits of the tag do not match with the bits in the X-bit array. Experimental results show that, on average, the TF Architecture reduces the dynamic power consumption across the studied applications up to 74.9%, 85.9%, and 84.5% when applied to L1 caches, L2 caches, and directory caches, respectively.",
	journal = "Journal of Parallel and Distributed Computing",
	title = "{T}he {T}ag {F}ilter {A}rchitecture: {A}n energy-efficient cache and directory design",
	volume = 100,
	year = 2016
}

Julio Sahuquillo, Houcine Hassan Mohamed, Salvador Petit, Jose Duato and José Luis March. A dynamic execution time estimation model to save energy in heterogeneous multicores running periodic tasks. Future Generation Computer Systems 56, 2015. BibTeX

@article{ 10.1016/j.future.2015.06.011,
	author = "Sahuquillo, Julio and Mohamed, Houcine Hassan and Petit, Salvador and Duato, Jose and March, Jos{\'e} Luis",
	abstract = "Nowadays, real-time embedded applications have to cope with an increasing demand of functionalities, which require increasing processing capabilities. With this aim real-time systems are being implemented on top of high-performance multicore processors that run multithreaded periodic workloads by allocating threads to individual cores. In addition, to improve both performance and energy savings, the industry is introducing new multicore designs such as ARM’s big.LITTLE that include heterogeneous cores in the same package. A key issue to improve energy savings in multicore embedded real-time systems and reduce the number of deadline misses is to accurately estimate the execution time of the tasks considering the supported processor frequencies. Two main aspects make this estimation difficult. First, the running threads compete among them for shared resources. Second, almost all current microprocessors implement Dynamic Voltage and Frequency Scaling (DVFS) regulators to dynamically adjust the voltage/frequency at run-time according to the workload behavior. Existing execution time estimation models rely on off-line analysis or on the assumption that the task execution time scales linearly with the processor frequency, which can bring important deviations since the memory system uses a different power supply. In contrast, this paper proposes the Processor–Memory (Proc–Mem) model, which dynamically predicts the distinct task execution times depending on the implemented processor frequencies. A power-aware EDF (Earliest Deadline First)-based scheduler using the Proc–Mem approach has been evaluated and compared against the same scheduler using a typical Constant Memory Access Time model, namely CMAT. Results on a heterogeneous multicore processor show that the average deviation of Proc–Mem is only by 5.55% with respect to the actual measured execution time, while the average deviation of the CMAT model is 36.42%. These results turn in important energy savings, by 18% on average and up to 31% in some mixes, in comparison to CMAT for a similar number of deadline misses.",
	journal = "Future Generation Computer Systems",
	title = "{A} dynamic execution time estimation model to save energy in heterogeneous multicores running periodic tasks",
	volume = 56,
	year = 2015
}

Carlos Reaño and Federico Silla. A Live Demo on Remote GPU Accelerated Deep Learning Using the rCUDA Middleware. In Proceedings of the Posters and Demos Session of the 16th International Middleware Conference, Middleware Posters and Demos 2015, Vancouver, BC, Canada, December 7-11, 2015. 2015, 3:1–3:2. URL, DOI BibTeX

@conference{dblp:conf/middleware/reanos15,
	author    = {Rea{\~n}o, Carlos and Silla, Federico},
	title     = {{A} {L}ive {D}emo on {R}emote {GPU} {A}ccelerated {D}eep {L}earning {U}sing the r{CUDA} {M}iddleware},
	booktitle = {Proceedings of the Posters and Demos Session of the 16th International Middleware Conference, Middleware Posters and Demos 2015, Vancouver, BC, Canada, December 7-11, 2015},
	pages     = {3:1--3:2},
	year      = {2015},
	doi       = {10.1145/2830894.2830897},
	url       = {http://doi.acm.org/10.1145/2830894.2830897},
	crossref  = {DBLP:conf/middleware/2015pd}
}

Carlos Reaño and Federico Silla. A Performance Comparison of CUDA Remote GPU Virtualization Frameworks. In 2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015. 2015, 488–489. URL, DOI BibTeX

@conference{dblp:conf/cluster/reanos15,
	author    = {Rea{\~n}o, Carlos and Silla, Federico},
	title     = {{A} {P}erformance {C}omparison of {CUDA} {R}emote {GPU} {V}irtualization {F}rameworks},
	booktitle = {2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015},
	pages     = {488--489},
	year      = {2015},
	doi       = {10.1109/CLUSTER.2015.76},
	url       = {https://doi.org/10.1109/CLUSTER.2015.76},
	crossref  = {DBLP:conf/cluster/2015}
}

Blesson Varghese, Javier Prades, Carlos Reaño and Federico Silla. Acceleration-as-a-Service: Exploiting Virtualised GPUs for a Financial Application. In 11th IEEE International Conference on e-Science, e-Science 2015, Munich, Germany, August 31 - September 4, 2015. 2015, 47–56. URL, DOI BibTeX

@conference{dblp:conf/escience/vargheseprs15,
	author    = {Varghese, Blesson and Prades, Javier and Rea{\~n}o, Carlos and Silla, Federico},
	title     = {{A}cceleration-as-a-{S}ervice: {E}xploiting {V}irtualised {GPU}s for a {F}inancial {A}pplication},
	booktitle = {11th IEEE International Conference on e-Science, e-Science 2015, Munich, Germany, August 31 - September 4, 2015},
	pages     = {47--56},
	year      = {2015},
	doi       = {10.1109/eScience.2015.15},
	url       = {https://doi.org/10.1109/eScience.2015.15},
	crossref  = {DBLP:conf/eScience/2015}
}

Francisco Candel, Salvador Petit, Julio Sahuquillo and Jose Duato. Accurately modeling the GPU memory subsystem. 2015 International Conference on High Performance Computing & Simulation (HPCS), 2015. BibTeX

@inproceedings{ 10.1109/hpcsim.2015.7237038,
	author = "Candel, Francisco and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
	booktitle = "2015 International Conference on High Performance Computing {\&} Simulation (HPCS)",
	title = "{A}ccurately modeling the {GPU} memory subsystem",
	year = 2015
}

Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Addressing Fairness in SMT Multicores with a Progress-Aware Schedule. IEEE International Parallel and Distributed Processing Symposium (IPDPS), 2015. BibTeX

@inproceedings{ 10.1109/ipdps.2015.48,
	author = "Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
	booktitle = "IEEE International Parallel and Distributed Processing Symposium (IPDPS)",
	title = "{A}ddressing {F}airness in {SMT} {M}ulticores with a {P}rogress-{A}ware {S}chedule",
	year = 2015
}

Alejandro Valero, Salvador Petit, Julio Sahuquillo and Jose Duato. A reuse-based refresh policy for energy-aware eDRAM caches. Microprocessors and Microsystems 39(1):37–48, 2015. BibTeX

@article{ 10.1016/j.micpro.2014.12.001,
	author = "Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
	abstract = "DRAM technology requires refresh operations to be performed in order to avoid data loss due to capacitance leakage. Refresh operations consume a significant amount of dynamic energy, which increases with the storage capacity. To reduce this amount of energy, prior work has focused on reducing refreshes in off-chip memories. However, this problem also appears in on-chip eDRAM memories implemented in current low-level caches. The refresh energy can dominate the dynamic consumption when a high percentage of the chip area is devoted to eDRAM cache structures. Replacement algorithms for high-associativity low-level caches select the victim block avoiding blocks more likely to be reused soon. This paper combines the state-of-the-art MRUT replacement algorithm with a novel refresh policy. Refresh operations are performed based on information produced by the replacement algorithm. The proposed refresh policy is implemented on top of an energy-aware eDRAM cache architecture, which implements bank-prediction and swap operations to save energy. Experimental results show that, compared to a conventional eDRAM design, the proposed energy-aware cache can achieve by 72% refresh energy savings. Considering the entire on-chip memory hierarchy consumption, the overall energy savings are 30%. These benefits come with minimal impact on performance (by 1.2%) and area overhead (by 0.4%).",
	journal = "Microprocessors and Microsystems",
	number = 1,
	pages = "37--48",
	title = "{A} reuse-based refresh policy for energy-aware e{DRAM} caches",
	volume = 39,
	year = 2015
}

Josué Feliu, Salvador Petit, Julio Sahuquillo and Jose Duato. Bandwidth-Aware On-Line Scheduling in SMT Multicores. IEEE Transactions on Computers 65(1), 2015. BibTeX

@article{ 10.1109/tc.2015.2428694,
	author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio and Duato, Jose",
	abstract = "The memory hierarchy plays a critical role on the performance of current chip multiprocessors. Main memory is shared by all the running processes, which can cause important bandwidth contention. In addition, when the processor implements SMT cores, the L1 bandwidth becomes shared among the threads running on each core. In such a case, bandwidth-aware schedulers emerge as an interesting approach to mitigate the contention. This work investigates the performance degradation that the processes suffer due to memory bandwidth constraints. Experiments show that main memory and L1 bandwidth contention negatively impact the process performance; in both cases, performance degradation can grow up to 40% for some of applications. To deal with contention, we devise a scheduling algorithm that consists of two policies guided by the bandwidth consumption gathered at runtime. The process selection policy balances the number of memory requests over the execution time to address main memory bandwidth contention. The process allocation policy tackles L1 bandwidth contention by balancing the L1 accesses among the L1 caches. The proposal is evaluated on a Xeon E5645 platform using a wide set of multiprogrammed workloads, achieving performance benefits up to 6.7% with respect to the Linux scheduler.",
	journal = "IEEE Transactions on Computers",
	number = 1,
	title = "{B}andwidth-{A}ware {O}n-{L}ine {S}cheduling in {SMT} {M}ulticores",
	volume = 65,
	year = 2015
}

Alejandro Valero, Julio Sahuquillo, Salvador Petit and Jose Duato. Design of Hybrid Second-Level Caches. IEEE Transactions on Computers 64(7):1884–1897, 2015. BibTeX

@article{ 10.1109/tc.2014.2346185,
	author = "Valero, Alejandro and Sahuquillo, Julio and Petit, Salvador and Duato, Jose",
	abstract = "In recent years, embedded dynamic random-access memory (eDRAM) technology has been implemented in last-level caches due to its low leakage energy consumption and high density. However, the fact that eDRAM presents slower access time than static RAM (SRAM) technology has prevented its inclusion in higher levels of the cache hierarchy. This paper proposes to mingle SRAM and eDRAM banks within the data array of second-level (L2) caches. The main goal is to achieve the best trade-off among performance, energy, and area. To this end, two main directions have been followed. First, this paper explores the optimal percentage of banks for each technology. Second, the cache controller is redesigned to deal with performance and energy. Performance is addressed by keeping the most likely accessed blocks in fast SRAM banks. In addition, energy savings are further enhanced by avoiding unnecessary destructive reads of eDRAM blocks. Experimental results show that, compared to a conventional SRAM L2 cache, a hybrid approach requiring similar or even lower area speedups the performance on average by 5.9 percent, while the total energy savings are by 32 percent. For a 45 nm technology node, the energy-delay-area product confirms that a hybrid cache is a better design than the conventional SRAM cache regardless of the number of eDRAM banks, and also better than a conventional eDRAM cache when the number of SRAM banks is an eighth of the total number of cache banks.",
	journal = "IEEE Transactions on Computers",
	number = 7,
	pages = "1884--1897",
	title = "{D}esign of {H}ybrid {S}econd-{L}evel {C}aches",
	volume = 64,
	year = 2015
}

Carlos Reaño, Federico Silla, Adrián Castelló, Antonio J Peña, Rafael Mayo, Enrique S Quintana-Ortí and Jose Duato. Improving the user experience of the rCUDA remote GPU virtualization framework. Concurrency and Computation: Practice and Experience 27(14):3746–3770, 2015. URL, DOI BibTeX

@article{ dblp:journals/concurrency/reanosgpmqd15,
	author = "Rea{\~n}o, Carlos and Silla, Federico and Castell{\'o}, Adri{\'a}n and Pe{\~n}a, Antonio J. and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose",
	doi = "10.1002/cpe.3409",
	journal = "Concurrency and Computation: Practice and Experience",
	number = 14,
	pages = "3746--3770",
	title = "{I}mproving the user experience of the r{CUDA} remote {GPU} virtualization framework",
	url = "https://doi.org/10.1002/cpe.3409",
	volume = 27,
	year = 2015
}

Carlos Reaño and Federico Silla. InfiniBand Verbs Optimizations for Remote GPU Virtualization. In 2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015. 2015, 825–832. URL, DOI BibTeX

@conference{dblp:conf/cluster/reanos15a,
	author    = {Rea{\~n}o, Carlos and Silla, Federico},
	title     = {{I}nfini{B}and {V}erbs {O}ptimizations for {R}emote {GPU} {V}irtualization},
	booktitle = {2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, Chicago, IL, USA, September 8-11, 2015},
	pages     = {825--832},
	year      = {2015},
	doi       = {10.1109/CLUSTER.2015.139},
	url       = {https://doi.org/10.1109/CLUSTER.2015.139},
	crossref  = {DBLP:conf/cluster/2015}
}

Carlos Reaño, Federico Silla, Gilad Shainer and Scot Schultz. Local and Remote GPUs Perform Similar with EDR 100G InfiniBand. In Proceedings of the Industrial Track of the 16th International Middleware Conference, Middleware Industry 2015, Vancouver, BC, Canada, December 7-11, 2015. 2015, 4:1–4:7. URL, DOI BibTeX

@conference{dblp:conf/middleware/reanosss15,
	author    = {Rea{\~n}o, Carlos and Silla, Federico and Shainer, Gilad and Schultz, Scot},
	title     = {{L}ocal and {R}emote {GPU}s {P}erform {S}imilar with {EDR} 100{G} {I}nfini{B}and},
	booktitle = {Proceedings of the Industrial Track of the 16th International Middleware Conference, Middleware Industry 2015, Vancouver, BC, Canada, December 7-11, 2015},
	pages     = {4:1--4:7},
	year      = {2015},
	doi       = {10.1145/2830013.2830015},
	url       = {http://doi.acm.org/10.1145/2830013.2830015},
	crossref  = {DBLP:conf/middleware/2015i}
}

Vicent Selfa, Julio Sahuquillo, Crispín Gomez and Maria E Gomez. Methodologies and Performance Metrics to Evaluate Multiprogram Workloads. 23rd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), 2015. BibTeX

@inproceedings{ 10.1109/pdp.2015.74,
	author = "Selfa, Vicent and Sahuquillo, Julio and Gomez, Crisp{\'i}n and Gomez, Maria E.",
	booktitle = "23rd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)",
	title = "{M}ethodologies and {P}erformance {M}etrics to {E}valuate {M}ultiprogram {W}orkloads",
	year = 2015
}

Carlos Reaño, Ferran Perez and Federico Silla. On the Design of a Demo for Exhibiting rCUDA. In 15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2015, Shenzhen, China, May 4-7, 2015. 2015, 1169–1172. URL, DOI BibTeX

@conference{dblp:conf/ccgrid/reanops15,
	author    = {Rea{\~n}o, Carlos and Perez, Ferran and Silla, Federico},
	title     = {{O}n the {D}esign of a {D}emo for {E}xhibiting r{CUDA}},
	booktitle = {15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2015, Shenzhen, China, May 4-7, 2015},
	pages     = {1169--1172},
	year      = {2015},
	doi       = {10.1109/CCGrid.2015.53},
	url       = {https://doi.org/10.1109/CCGrid.2015.53},
	crossref  = {DBLP:conf/ccgrid/2015}
}

Joan Josep Valls, Julio Sahuquillo, Alberto Ros and Maria E Gomez. PS directory: A scalable multilevel directory cache for CMPs. The Journal of Supercomputing 71(8):2847–2876, 2015. BibTeX

@article{ 10.1007/s11227-014-1332-5,
	author = "Valls, Joan Josep and Sahuquillo, Julio and Ros, Alberto and Gomez, Maria E.",
	abstract = "As the number of cores increases in current and future chip-multiprocessor (CMP) generations, coherence protocols must rely on novel hardware structures to scale in terms of performance, power, and area. Systems that use directory information for coherence purposes are currently the most scalable alternative. This paper studies the important differences between the directory behavior of private and shared blocks, which claim for a separate management of both types of blocks at the directory. We propose the PS directory, a two-level directory cache that keeps the reduced number of frequently accessed shared entries in a small and fast first-level cache, namely Shared cache, and uses a larger and slower second-level Private cache to track the large amount of private blocks. Entries in the Private cache do not implement the sharer vector, which allows important silicon area savings. Speed and area reasons suggest the use of eDRAM technology, much denser but slower than SRAM technology, for the Private cache, which in turn brings energy savings. Experimental results for a 16-core CMP show that, compared to a conventional directory, the PS directory improves performance by 14 % while reducing silicon area and energy consumption by 34 and 27 %, respectively. Also, compared to the state-of-the-art Multi-Grain Directory, the PS directory apart from increasing performance, it reduces power by 18.7 %, and provides more scalability in terms of area.",
	journal = "The Journal of Supercomputing",
	number = 8,
	pages = "2847--2876",
	title = "{PS} directory: {A} scalable multilevel directory cache for {CMP}s",
	volume = 71,
	year = 2015
}

Joan Josep Valls, Alberto Ros, Maria E Gomez and Julio Sahuquillo. PS-Cache: an energy-efficient cache design for chip multiprocessors. The Journal of Supercomputing 71(1):67–86, 2015. BibTeX

@article{ 10.1007/s11227-014-1288-5,
	author = "Valls, Joan Josep and Ros, Alberto and Gomez, Maria E. and Sahuquillo, Julio",
	abstract = "Power consumption has become a major design concern in current high-performance chip multiprocessors, and this problem exacerbates with the number of core counts. A significant fraction of the total power budget is often consumed by on-chip caches, thus important research has focused on reducing energy consumption in these structures. To enhance performance, on-chip caches are being deployed with a high associativity degree. Consequently, accessing concurrently all the ways in the cache set is costly in terms of energy. This paper presents the PS-Cache architecture, an energy-efficient cache design that reduces the number of accessed ways without hurting the performance. The PS-Cache takes advantage of the private-shared knowledge of the referenced block to reduce energy by accessing only those ways holding the kind of block looked up. Experimental results show that, on average, the PS-Cache architecture can reduce the dynamic energy consumption of L1 and L2 caches by 22 and 40%, respectively.",
	journal = "The Journal of Supercomputing",
	number = 1,
	pages = "67--86",
	title = "{PS}-{C}ache: an energy-efficient cache design for chip multiprocessors",
	volume = 71,
	year = 2015
}

Joan Josep Valls, Julio Sahuquillo, Alberto Ros and Maria E Gomez. The Tag Filter Cache: An Energy-Efficient Approach. 23rd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), 2015. BibTeX

@inproceedings{ 10.1109/pdp.2015.58,
	author = "Valls, Joan Josep and Sahuquillo, Julio and Ros, Alberto and Gomez, Maria E.",
	abstract = "Power consumption in current high-performance chip multiprocessors (CMPs) has become a major design concern. The current trend of increasing the core count aggravates this problem. On-chip caches consume a significant fraction of the total power budget. Most of the proposed techniques to reduce the energy consumption of these memory structures are at the cost of performance, which may become unacceptable for high-performance CMPs. On-chip caches in multi-core systems are usually deployed with a high associativity degree in order to enhance performance. Even first-level caches are currently implemented with eight ways. The concurrent access to all the ways in the cache set is costly in terms of energy.",
	booktitle = "23rd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)",
	title = "{T}he {T}ag {F}ilter {C}ache: {A}n {E}nergy-{E}fficient {A}pproach",
	year = 2015
}

Antonio José Peña, Carlos Reaño, Federico Silla, Rafael Mayo, Enrique S Quintana-Ortí and Jose Duato. A complete and efficient CUDA-sharing solution for HPC clusters. Parallel Computing 40(10):574–588, 2014. URL, DOI BibTeX

@article{dblp:journals/pc/penarsmqd14,
	author  = {Pe{\~n}a, Antonio Jos{\'e} and Rea{\~n}o, Carlos and Silla, Federico and Mayo, Rafael and Quintana-Ort{\'i}, Enrique S. and Duato, Jose},
	title   = {{A} complete and efficient {CUDA}-sharing solution for {HPC} clusters},
	journal = {Parallel Computing},
	volume  = {40},
	number  = {10},
	pages   = {574--588},
	year    = {2014},
	doi     = {10.1016/j.parco.2014.09.011},
	url     = {http://dx.doi.org/10.1016/j.parco.2014.09.011}
}

Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duato. Addressing bandwidth contention in SMT multicores through scheduling. In International Conference on Supercomputing, ICS'14. 2014, 167. BibTeX

@conference{dblp:conf/ics/feliuspd14,
	author    = {Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duato, Jose},
	title     = {{A}ddressing bandwidth contention in {SMT} multicores through scheduling},
	booktitle = {International Conference on Supercomputing, ICS'14},
	pages     = {167},
	year      = {2014},
	crossref  = {DBLP:conf/ics/2014}
}

Carlos Reaño, Federico Silla, Antonio José Peña, Gilad Shainer, Scot Schultz, Adrián Castelló Gimeno, Enrique S Quintana-Ortí and Jose Duato. Boosting the performance of remote GPU virtualization using InfiniBand connect-IB and PCIe 3.0. In 2014 IEEE International Conference on Cluster Computing, CLUSTER 2014, Madrid, Spain, September 22-26, 2014. 2014, 266–267. URL, DOI BibTeX

@conference{ dblp:conf/cluster/reanospssgqd14,
	author = "Rea{\~n}o, Carlos and Silla, Federico and Pe{\~n}a, Antonio Jos{\'e} and Shainer, Gilad and Schultz, Scot and Castell{\'o} Gimeno, Adri{\'a}n and Quintana-Ort{\'i}, Enrique S. and Duato, Jose",
	booktitle = "2014 IEEE International Conference on Cluster Computing, CLUSTER 2014, Madrid, Spain, September 22-26, 2014",
	crossref = "DBLP:conf/cluster/2014",
	doi = "10.1109/CLUSTER.2014.6968737",
	pages = "266--267",
	title = "{B}oosting the performance of remote {GPU} virtualization using {I}nfini{B}and connect-{IB} and {PCI}e 3.0",
	url = "http://dx.doi.org/10.1109/CLUSTER.2014.6968737",
	year = 2014
}